File indexing completed on 2024-04-28 04:32:47

0001 /*
0002     SPDX-FileCopyrightText: 2005 Piotr Szymanski <niedakh@gmail.com>
0003 
0004     SPDX-License-Identifier: GPL-2.0-or-later
0005 */
0006 
0007 #include "textpage.h"
0008 #include "textpage_p.h"
0009 
0010 #include <QDebug>
0011 
0012 #include "area.h"
0013 #include "debug_p.h"
0014 #include "misc.h"
0015 #include "page.h"
0016 #include "page_p.h"
0017 #include <unordered_set>
0018 
0019 #include <cstring>
0020 
0021 #include <QVarLengthArray>
0022 #include <QtAlgorithms>
0023 
0024 using namespace Okular;
0025 
0026 // Many of the strings are being reused; especially
0027 // those less than 2 letters are very common
0028 // Use the implicit shared bits from QString to
0029 // not keep multiple same strings around, but just up the
0030 // refcount a bit
0031 // The main reason for '2' is that most calls here happens
0032 // in auxillary threads that are following a document
0033 // and keeping the pool thread_local gives quite a bit
0034 // of advantage here.
0035 // Some calls though comes from the main thread, so we
0036 // shouldn't keep all the long strings allocated in the main
0037 // thread around forever.
0038 // '2' has been chosen by random testing, and guesswork.
0039 static QString fromPool(const QString &str)
0040 {
0041     if (str.length() > 2) {
0042         return str;
0043     }
0044 
0045     thread_local std::unordered_set<QString> pool;
0046     auto [iterator, success] = pool.insert(str);
0047     return *iterator;
0048 }
0049 
0050 class SearchPoint
0051 {
0052 public:
0053     SearchPoint()
0054         : offset_begin(-1)
0055         , offset_end(-1)
0056     {
0057     }
0058 
0059     /** The TextEntity containing the first character of the match. */
0060     TextEntity::List::ConstIterator it_begin;
0061 
0062     /** The TextEntity containing the last character of the match. */
0063     TextEntity::List::ConstIterator it_end;
0064 
0065     /** The index of the first character of the match in (*it_begin)->text().
0066      *  Satisfies 0 <= offset_begin < (*it_begin)->text().length().
0067      */
0068     int offset_begin;
0069 
0070     /** One plus the index of the last character of the match in (*it_end)->text().
0071      *  Satisfies 0 < offset_end <= (*it_end)->text().length().
0072      */
0073     int offset_end;
0074 };
0075 
0076 /* text comparison functions */
0077 
0078 static bool CaseInsensitiveCmpFn(QStringView from, QStringView to)
0079 {
0080 #ifdef DEBUG_TEXTPAGE
0081     qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)";
0082 #endif
0083     return from.compare(to, Qt::CaseInsensitive) == 0;
0084 }
0085 
0086 static bool CaseSensitiveCmpFn(QStringView from, QStringView to)
0087 {
0088 #ifdef DEBUG_TEXTPAGE
0089     qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)";
0090 #endif
0091     return from.compare(to, Qt::CaseSensitive) == 0;
0092 }
0093 
0094 /**
0095  * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line
0096  * overlap within @p threshold percent, i. e. iff the ratio of the length of the
0097  * intersection of the segments to the length of the shortest of the two input segments
0098  * is not smaller than the threshold.
0099  */
0100 static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold)
0101 {
0102     // check if one consumes another fully (speed optimization)
0103 
0104     if (left1 <= left2 && right1 >= right2) {
0105         return true;
0106     }
0107 
0108     if (left1 >= left2 && right1 <= right2) {
0109         return true;
0110     }
0111 
0112     // check if there is overlap above threshold
0113     if (right2 >= left1 && right1 >= left2) {
0114         double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1;
0115 
0116         double length1 = right1 - left1, length2 = right2 - left2;
0117 
0118         return overlap * 100 >= threshold * qMin(length1, length2);
0119     }
0120 
0121     return false;
0122 }
0123 
0124 static bool doesConsumeY(const QRect first, const QRect second, int threshold)
0125 {
0126     return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold);
0127 }
0128 
0129 static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold)
0130 {
0131     return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold);
0132 }
0133 
0134 TextEntity::TextEntity(const QString &text, const NormalizedRect &area)
0135     : m_text(fromPool(text))
0136     , m_area(area)
0137 {
0138 }
0139 
0140 TextEntity::~TextEntity() = default;
0141 
0142 QString TextEntity::text() const
0143 {
0144     return m_text;
0145 }
0146 
0147 NormalizedRect TextEntity::area() const
0148 {
0149     return m_area;
0150 }
0151 
0152 NormalizedRect TextEntity::transformedArea(const QTransform &matrix) const
0153 {
0154     NormalizedRect transformed_area = m_area;
0155     transformed_area.transform(matrix);
0156     return transformed_area;
0157 }
0158 
0159 TextPagePrivate::TextPagePrivate()
0160     : m_page(nullptr)
0161 {
0162 }
0163 
0164 TextPagePrivate::~TextPagePrivate()
0165 {
0166     qDeleteAll(m_searchPoints);
0167 }
0168 
0169 TextPage::TextPage()
0170     : d(new TextPagePrivate())
0171 {
0172 }
0173 
0174 TextPage::TextPage(const TextEntity::List &words)
0175     : d(new TextPagePrivate())
0176 {
0177     d->m_words = words;
0178 }
0179 
0180 TextPage::~TextPage()
0181 {
0182     delete d;
0183 }
0184 
0185 void TextPage::append(const QString &text, const NormalizedRect &area)
0186 {
0187     if (!text.isEmpty()) {
0188         if (!d->m_words.isEmpty()) {
0189             TextEntity &lastEntity = d->m_words.last();
0190             const QString concatText = lastEntity.text() + text.normalized(QString::NormalizationForm_KC);
0191             if (concatText != concatText.normalized(QString::NormalizationForm_KC)) {
0192                 // If this happens it means that the new text + old one have combined, for example A and ◌̊  form Å
0193                 NormalizedRect newArea = area | lastEntity.area();
0194                 d->m_words.removeLast();
0195                 d->m_words.append(TextEntity(concatText.normalized(QString::NormalizationForm_KC), newArea));
0196                 return;
0197             }
0198         }
0199 
0200         d->m_words.append(TextEntity(text.normalized(QString::NormalizationForm_KC), area));
0201     }
0202 }
0203 
0204 struct WordWithCharacters {
0205     WordWithCharacters(const TextEntity &w, const TextEntity::List &c)
0206         : word(w)
0207         , characters(c)
0208     {
0209     }
0210 
0211     inline QString text() const
0212     {
0213         return word.text();
0214     }
0215 
0216     inline NormalizedRect area() const
0217     {
0218         return word.area();
0219     }
0220 
0221     TextEntity word;
0222     TextEntity::List characters;
0223 };
0224 typedef QList<WordWithCharacters> WordsWithCharacters;
0225 
0226 /**
0227  * We will divide the whole page in some regions depending on the horizontal and
0228  * vertical spacing among different regions. Each region will have an area and an
0229  * associated WordsWithCharacters in sorted order.
0230  */
0231 class RegionText
0232 {
0233 public:
0234     RegionText() {};
0235 
0236     RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area)
0237         : m_region_wordWithCharacters(wordsWithCharacters)
0238         , m_area(area)
0239     {
0240     }
0241 
0242     inline QString string() const
0243     {
0244         QString res;
0245         for (const WordWithCharacters &word : m_region_wordWithCharacters) {
0246             res += word.text();
0247         }
0248         return res;
0249     }
0250 
0251     inline WordsWithCharacters text() const
0252     {
0253         return m_region_wordWithCharacters;
0254     }
0255 
0256     inline QRect area() const
0257     {
0258         return m_area;
0259     }
0260 
0261     inline void setArea(const QRect area)
0262     {
0263         m_area = area;
0264     }
0265 
0266     inline void setText(const WordsWithCharacters &wordsWithCharacters)
0267     {
0268         m_region_wordWithCharacters = wordsWithCharacters;
0269     }
0270 
0271 private:
0272     WordsWithCharacters m_region_wordWithCharacters;
0273     QRect m_area;
0274 };
0275 
0276 RegularAreaRect *TextPage::textArea(TextSelection *sel) const
0277 {
0278     if (d->m_words.isEmpty()) {
0279         return new RegularAreaRect();
0280     }
0281 
0282     /**
0283         It works like this:
0284         There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0)
0285         rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those
0286         points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search
0287         for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both
0288         of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a
0289         1. (rx,ry)x(1,ty)
0290         2. (0,ty)x(1,uy)
0291         3. (0,uy)x(vx,vy)
0292 
0293         To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor
0294         or that has a left border >= cx and bottom border >= cy.
0295     */
0296     RegularAreaRect *ret = new RegularAreaRect;
0297 
0298     PagePrivate *pagePrivate = PagePrivate::get(d->m_page);
0299     const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
0300     const double scaleX = d->m_page->width();
0301     const double scaleY = d->m_page->height();
0302 
0303     NormalizedPoint startC = sel->start();
0304     NormalizedPoint endC = sel->end();
0305     NormalizedPoint temp;
0306 
0307     // if startPoint is right to endPoint swap them
0308     if (startC.x > endC.x) {
0309         temp = startC;
0310         startC = endC;
0311         endC = temp;
0312     }
0313 
0314     // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document
0315     const NormalizedRect boundingRect = d->m_page->boundingBox();
0316     const QRect content = boundingRect.geometry(scaleX, scaleY);
0317     const double minX = content.left();
0318     const double maxX = content.right();
0319     const double minY = content.top();
0320     const double maxY = content.bottom();
0321 
0322     /**
0323      * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for
0324      * the endRectangle. We have four cases:
0325      *
0326      * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start
0327      * and endPoint are outof the bounding rect (do not intersect)
0328      *
0329      * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle
0330      *
0331      * Case 2(a): find the rectangle which contains start and endpoint and having some
0332      * TextEntity
0333      *
0334      * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any
0335      * TextEntity within the rect made by startPoint and endPoint
0336      *
0337      * Case 3: Now, we may have two type of selection.
0338      * 1. startpoint is left-top of start_end and endpoint is right-bottom
0339      * 2. startpoint is left-bottom of start_end and endpoint is top-right
0340      *
0341      * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have
0342      * text within them. so, we need to search for the best suitable textposition for start and end.
0343      *
0344      * Case 3(a): We search the nearest rectangle consisting of some
0345      * TinyTextEntity right to or bottom of the startPoint for selection 01.
0346      * And, for selection 02, we have to search for right and top
0347      *
0348      * Case 3(b): For endpoint, we have to find the point top of or left to
0349      * endpoint if we have selection 01.
0350      * Otherwise, the search will be left and bottom
0351      */
0352 
0353     // we know that startC.x > endC.x, we need to decide which is top and which is bottom
0354     const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y);
0355 
0356     // Case 1(a)
0357     if (!boundingRect.intersects(start_end)) {
0358         return ret;
0359     } else {
0360         // case 1(b)
0361         /**
0362             note that, after swapping of start and end, we know that,
0363             start is always left to end. but, we cannot say start is
0364             positioned upper than end.
0365         **/
0366         // if start is left to content rect take it to content rect boundary
0367         if (startC.x * scaleX < minX) {
0368             startC.x = minX / scaleX;
0369         }
0370         if (endC.x * scaleX > maxX) {
0371             endC.x = maxX / scaleX;
0372         }
0373 
0374         // if start is top to end (selection type 01)
0375         if (startC.y * scaleY < minY) {
0376             startC.y = minY / scaleY;
0377         }
0378         if (endC.y * scaleY > maxY) {
0379             endC.y = maxY / scaleY;
0380         }
0381 
0382         // if start is bottom to end (selection type 02)
0383         if (startC.y * scaleY > maxY) {
0384             startC.y = maxY / scaleY;
0385         }
0386         if (endC.y * scaleY < minY) {
0387             endC.y = minY / scaleY;
0388         }
0389     }
0390 
0391     TextEntity::List::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
0392     TextEntity::List::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd;
0393     const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight;
0394 
0395     NormalizedRect tmp;
0396     // case 2(a)
0397     for (; it != itEnd; ++it) {
0398         tmp = it->area();
0399         if (tmp.contains(startC.x, startC.y)) {
0400             start = it;
0401         }
0402         if (tmp.contains(endC.x, endC.y)) {
0403             end = it;
0404         }
0405     }
0406 
0407     // case 2(b)
0408     it = tmpIt;
0409     if (start == it && end == itEnd) {
0410         for (; it != itEnd; ++it) {
0411             // is there any text rectangle within the start_end rect
0412             tmp = it->area();
0413             if (start_end.intersects(tmp)) {
0414                 break;
0415             }
0416         }
0417 
0418         // we have searched every text entities, but none is within the rectangle created by start and end
0419         // so, no selection should be done
0420         if (it == itEnd) {
0421             return ret;
0422         }
0423     }
0424     it = tmpIt;
0425     bool selection_two_start = false;
0426 
0427     // case 3.a
0428     if (start == it) {
0429         NormalizedRect rect;
0430 
0431         // selection type 01
0432         if (startC.y <= endC.y) {
0433             for (; it != itEnd; ++it) {
0434                 rect = it->area();
0435                 bool flagV = !rect.isBottom(startC);
0436 
0437                 if (flagV && rect.isRight(startC)) {
0438                     start = it;
0439                     break;
0440                 }
0441             }
0442         }
0443 
0444         // selection type 02
0445         else {
0446             selection_two_start = true;
0447             int distance = scaleX + scaleY + 100;
0448 
0449             for (; it != itEnd; ++it) {
0450                 rect = it->area();
0451 
0452                 if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) {
0453                     QRect entRect = rect.geometry(scaleX, scaleY);
0454                     int xdist, ydist;
0455                     xdist = entRect.center().x() - startC.x * scaleX;
0456                     ydist = entRect.center().y() - startC.y * scaleY;
0457 
0458                     // make them positive
0459                     if (xdist < 0) {
0460                         xdist = -xdist;
0461                     }
0462                     if (ydist < 0) {
0463                         ydist = -ydist;
0464                     }
0465 
0466                     if ((xdist + ydist) < distance) {
0467                         distance = xdist + ydist;
0468                         start = it;
0469                     }
0470                 }
0471             }
0472         }
0473     }
0474 
0475     // case 3.b
0476     if (end == itEnd) {
0477         it = tmpIt;
0478         itEnd = itEnd - 1;
0479 
0480         NormalizedRect rect;
0481 
0482         if (startC.y <= endC.y) {
0483             for (; itEnd >= it; itEnd--) {
0484                 rect = itEnd->area();
0485                 bool flagV = !rect.isTop(endC);
0486 
0487                 if (flagV && rect.isLeft(endC)) {
0488                     end = itEnd;
0489                     break;
0490                 }
0491             }
0492         }
0493 
0494         else {
0495             int distance = scaleX + scaleY + 100;
0496             for (; itEnd >= it; itEnd--) {
0497                 rect = itEnd->area();
0498 
0499                 if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) {
0500                     QRect entRect = rect.geometry(scaleX, scaleY);
0501                     int xdist, ydist;
0502                     xdist = entRect.center().x() - endC.x * scaleX;
0503                     ydist = entRect.center().y() - endC.y * scaleY;
0504 
0505                     // make them positive
0506                     if (xdist < 0) {
0507                         xdist = -xdist;
0508                     }
0509                     if (ydist < 0) {
0510                         ydist = -ydist;
0511                     }
0512 
0513                     if ((xdist + ydist) < distance) {
0514                         distance = xdist + ydist;
0515                         end = itEnd;
0516                     }
0517                 }
0518             }
0519         }
0520     }
0521 
0522     /* if start and end in selection 02 are in the same column, and we
0523      start at an empty space we have to remove the selection of last
0524      character
0525     */
0526     if (selection_two_start) {
0527         if (start > end) {
0528             start = start - 1;
0529         }
0530     }
0531 
0532     // if start is less than end swap them
0533     if (start > end) {
0534         it = start;
0535         start = end;
0536         end = it;
0537     }
0538 
0539     // removes the possibility of crash, in case none of 1 to 3 is true
0540     if (end == d->m_words.constEnd()) {
0541         end--;
0542     }
0543 
0544     for (; start <= end; start++) {
0545         ret->appendShape(start->transformedArea(matrix), side);
0546     }
0547 
0548     return ret;
0549 }
0550 
0551 RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
0552 {
0553     SearchDirection dir = direct;
0554     // invalid search request
0555     if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull())) {
0556         return nullptr;
0557     }
0558     TextEntity::List::ConstIterator start;
0559     int start_offset = 0;
0560     TextEntity::List::ConstIterator end;
0561     const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID);
0562     if (sIt == d->m_searchPoints.constEnd()) {
0563         // if no previous run of this search is found, then set it to start
0564         // from the beginning (respecting the search direction)
0565         if (dir == NextResult) {
0566             dir = FromTop;
0567         } else if (dir == PreviousResult) {
0568             dir = FromBottom;
0569         }
0570     }
0571     bool forward = true;
0572     switch (dir) {
0573     case FromTop:
0574         start = d->m_words.constBegin();
0575         start_offset = 0;
0576         end = d->m_words.constEnd();
0577         break;
0578     case FromBottom:
0579         start = d->m_words.constEnd();
0580         start_offset = 0;
0581         end = d->m_words.constBegin();
0582         forward = false;
0583         break;
0584     case NextResult:
0585         start = (*sIt)->it_end;
0586         start_offset = (*sIt)->offset_end;
0587         end = d->m_words.constEnd();
0588         break;
0589     case PreviousResult:
0590         start = (*sIt)->it_begin;
0591         start_offset = (*sIt)->offset_begin;
0592         end = d->m_words.constBegin();
0593         forward = false;
0594         break;
0595     };
0596     RegularAreaRect *ret = nullptr;
0597     const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn;
0598     if (forward) {
0599         ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end);
0600     } else {
0601         ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end);
0602     }
0603     return ret;
0604 }
0605 
0606 // hyphenated '-' must be at the end of a word, so hyphenation means
0607 // we have a '-' just followed by a '\n' character
0608 // check if the string contains a '-' character
0609 // if the '-' is the last entry
0610 static int stringLengthAdaptedWithHyphen(const QString &str, TextEntity::List::ConstIterator it, TextEntity::List::ConstIterator textListEnd)
0611 {
0612     const int len = str.length();
0613 
0614     // hyphenated '-' must be at the end of a word, so hyphenation means
0615     // we have a '-' just followed by a '\n' character
0616     // check if the string contains a '-' character
0617     // if the '-' is the last entry
0618     if (str.endsWith(QLatin1Char('-'))) {
0619         // validity chek of it + 1
0620         if ((it + 1) != textListEnd) {
0621             // 1. if the next character is '\n'
0622             const QString &lookahedStr = (it + 1)->text();
0623             if (lookahedStr.startsWith(QLatin1Char('\n'))) {
0624                 return len - 1;
0625             }
0626 
0627             // 2. if the next word is in a different line or not
0628             const NormalizedRect &hyphenArea = it->area();
0629             const NormalizedRect &lookaheadArea = (it + 1)->area();
0630 
0631             // lookahead to check whether both the '-' rect and next character rect overlap
0632             if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) {
0633                 return len - 1;
0634             }
0635         }
0636     }
0637     // else if it is the second last entry - for example in pdf format
0638     else if (str.endsWith(QLatin1String("-\n"))) {
0639         return len - 2;
0640     }
0641 
0642     return len;
0643 }
0644 
0645 RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp)
0646 {
0647     PagePrivate *pagePrivate = PagePrivate::get(m_page);
0648     const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
0649     RegularAreaRect *ret = new RegularAreaRect;
0650 
0651     for (TextEntity::List::ConstIterator it = sp->it_begin;; it++) {
0652         const TextEntity &curEntity = *it;
0653         ret->append(curEntity.transformedArea(matrix));
0654 
0655         if (it == sp->it_end) {
0656             break;
0657         }
0658     }
0659 
0660     ret->simplify();
0661     return ret;
0662 }
0663 
0664 RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, TextEntity::List::ConstIterator start, int start_offset, TextEntity::List::ConstIterator end)
0665 {
0666     // normalize query search all unicode (including glyphs)
0667     const QString query = _query.normalized(QString::NormalizationForm_KC);
0668 
0669     // j is the current position in our query
0670     // queryLeft is the length of the query we have left to match
0671     int j = 0, queryLeft = query.length();
0672 
0673     TextEntity::List::ConstIterator it = start;
0674     int offset = start_offset;
0675 
0676     TextEntity::List::ConstIterator it_begin = TextEntity::List::ConstIterator();
0677     int offset_begin = 0; // dummy initial value to suppress compiler warnings
0678 
0679     while (it != end) {
0680         const TextEntity &curEntity = *it;
0681         const QString &str = curEntity.text();
0682         const int strLen = str.length();
0683         const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
0684         // adjustedLen <= strLen
0685 
0686         if (offset >= strLen) {
0687             it++;
0688             offset = 0;
0689             continue;
0690         }
0691 
0692         if (it_begin == TextEntity::List::ConstIterator()) {
0693             it_begin = it;
0694             offset_begin = offset;
0695         }
0696 
0697         // Let the user write the hyphen or not when searching for text
0698         int matchedLen = -1;
0699         for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
0700             // we have equal (or less than) area of the query left as the length of the current
0701             // entity
0702             const int min = qMin(queryLeft, matchingLen - offset);
0703             if (comparer(QStringView {str}.mid(offset, min), QStringView {query}.mid(j, min))) {
0704                 matchedLen = min;
0705                 break;
0706             }
0707         }
0708 
0709         if (matchedLen == -1) {
0710             // we have not matched
0711             // this means we do not have a complete match
0712             // we need to get back to query start
0713             // and continue the search from this place
0714 #ifdef DEBUG_TEXTPAGE
0715             qCDebug(OkularCoreDebug) << "\tnot matched";
0716 #endif
0717             j = 0;
0718             queryLeft = query.length();
0719             it = it_begin;
0720             offset = offset_begin + 1;
0721             it_begin = TextEntity::List::ConstIterator();
0722         } else {
0723             // we have a match
0724             // move the current position in the query
0725             // to the position after the length of this string
0726             // we matched
0727             // subtract the length of the current entity from
0728             // the left length of the query
0729 #ifdef DEBUG_TEXTPAGE
0730             qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen;
0731 #endif
0732             j += matchedLen;
0733             queryLeft -= matchedLen;
0734 
0735             if (queryLeft == 0) {
0736                 // save or update the search point for the current searchID
0737                 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
0738                 if (sIt == m_searchPoints.end()) {
0739                     sIt = m_searchPoints.insert(searchID, new SearchPoint);
0740                 }
0741                 SearchPoint *sp = *sIt;
0742                 sp->it_begin = it_begin;
0743                 sp->it_end = it;
0744                 sp->offset_begin = offset_begin;
0745                 sp->offset_end = offset + matchedLen;
0746                 return searchPointToArea(sp);
0747             }
0748 
0749             it++;
0750             offset = 0;
0751         }
0752     }
0753     // end of loop - it means that we've ended the textentities
0754 
0755     const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
0756     if (sIt != m_searchPoints.end()) {
0757         SearchPoint *sp = *sIt;
0758         m_searchPoints.erase(sIt);
0759         delete sp;
0760     }
0761     return nullptr;
0762 }
0763 
0764 RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, TextEntity::List::ConstIterator start, int start_offset, TextEntity::List::ConstIterator end)
0765 {
0766     // normalize query to search all unicode (including glyphs)
0767     const QString query = _query.normalized(QString::NormalizationForm_KC);
0768 
0769     // j is the current position in our query
0770     // len is the length of the string in TextEntity
0771     // queryLeft is the length of the query we have left
0772     int j = query.length(), queryLeft = query.length();
0773 
0774     TextEntity::List::ConstIterator it = start;
0775     int offset = start_offset;
0776 
0777     TextEntity::List::ConstIterator it_begin = TextEntity::List::ConstIterator();
0778     int offset_begin = 0; // dummy initial value to suppress compiler warnings
0779 
0780     while (true) {
0781         if (offset <= 0) {
0782             if (it == end) {
0783                 break;
0784             }
0785             it--;
0786         }
0787 
0788         const TextEntity &curEntity = *it;
0789         const QString &str = curEntity.text();
0790         const int strLen = str.length();
0791         const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
0792         // adjustedLen <= strLen
0793 
0794         if (offset <= 0) {
0795             offset = strLen;
0796         }
0797 
0798         if (it_begin == TextEntity::List::ConstIterator()) {
0799             it_begin = it;
0800             offset_begin = offset;
0801         }
0802 
0803         // Let the user write the hyphen or not when searching for text
0804         int matchedLen = -1;
0805         // we have equal (or less than) area of the query left as the length of the current
0806         // entity
0807         for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
0808             const int hyphenOffset = (strLen - matchingLen);
0809             const int min = qMin(queryLeft + hyphenOffset, offset);
0810             if (comparer(QStringView {str}.mid(offset - min, min - hyphenOffset), QStringView {query}.mid(j - min + hyphenOffset, min - hyphenOffset))) {
0811                 matchedLen = min - hyphenOffset;
0812                 break;
0813             }
0814         }
0815 
0816         if (matchedLen == -1) {
0817             // we have not matched
0818             // this means we do not have a complete match
0819             // we need to get back to query start
0820             // and continue the search from this place
0821 #ifdef DEBUG_TEXTPAGE
0822             qCDebug(OkularCoreDebug) << "\tnot matched";
0823 #endif
0824 
0825             j = query.length();
0826             queryLeft = query.length();
0827             it = it_begin;
0828             offset = offset_begin - 1;
0829             it_begin = TextEntity::List::ConstIterator();
0830         } else {
0831             // we have a match
0832             // move the current position in the query
0833             // to the position after the length of this string
0834             // we matched
0835             // subtract the length of the current entity from
0836             // the left length of the query
0837 #ifdef DEBUG_TEXTPAGE
0838             qCDebug(OkularCoreDebug) << "\tmatched";
0839 #endif
0840             j -= matchedLen;
0841             queryLeft -= matchedLen;
0842 
0843             if (queryLeft == 0) {
0844                 // save or update the search point for the current searchID
0845                 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
0846                 if (sIt == m_searchPoints.end()) {
0847                     sIt = m_searchPoints.insert(searchID, new SearchPoint);
0848                 }
0849                 SearchPoint *sp = *sIt;
0850                 sp->it_begin = it;
0851                 sp->it_end = it_begin;
0852                 sp->offset_begin = offset - matchedLen;
0853                 sp->offset_end = offset_begin;
0854                 return searchPointToArea(sp);
0855             }
0856 
0857             offset = 0;
0858         }
0859     }
0860     // end of loop - it means that we've ended the textentities
0861 
0862     const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
0863     if (sIt != m_searchPoints.end()) {
0864         SearchPoint *sp = *sIt;
0865         m_searchPoints.erase(sIt);
0866         delete sp;
0867     }
0868     return nullptr;
0869 }
0870 
0871 QString TextPage::text(const RegularAreaRect *area) const
0872 {
0873     return text(area, AnyPixelTextAreaInclusionBehaviour);
0874 }
0875 
0876 QString TextPage::text(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
0877 {
0878     if (area && area->isNull()) {
0879         return QString();
0880     }
0881 
0882     TextEntity::List::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
0883     QString ret;
0884     if (area) {
0885         for (; it != itEnd; ++it) {
0886             if (b == AnyPixelTextAreaInclusionBehaviour) {
0887                 if (area->intersects(it->area())) {
0888                     ret += it->text();
0889                 }
0890             } else {
0891                 NormalizedPoint center = it->area().center();
0892                 if (area->contains(center.x, center.y)) {
0893                     ret += it->text();
0894                 }
0895             }
0896         }
0897     } else {
0898         for (; it != itEnd; ++it) {
0899             ret += it->text();
0900         }
0901     }
0902     return ret;
0903 }
0904 
0905 static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second)
0906 {
0907     QRect firstArea = first.area().roundedGeometry(1000, 1000);
0908     QRect secondArea = second.area().roundedGeometry(1000, 1000);
0909 
0910     return firstArea.left() < secondArea.left();
0911 }
0912 
0913 static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second)
0914 {
0915     const QRect firstArea = first.area().roundedGeometry(1000, 1000);
0916     const QRect secondArea = second.area().roundedGeometry(1000, 1000);
0917 
0918     return firstArea.top() < secondArea.top();
0919 }
0920 
0921 /**
0922  * Sets a new world list. Deleting the contents of the old one
0923  */
0924 void TextPagePrivate::setWordList(const TextEntity::List &list)
0925 {
0926     m_words = list;
0927 }
0928 
0929 /**
0930  * Remove all the spaces in between texts. It will make all the generators
0931  * same, whether they save spaces(like pdf) or not(like djvu).
0932  */
0933 static void removeSpace(TextEntity::List *words)
0934 {
0935     TextEntity::List::Iterator it = words->begin();
0936     const QString str(QLatin1Char(' '));
0937 
0938     while (it != words->end()) {
0939         if (it->text() == str) {
0940             it = words->erase(it);
0941         } else {
0942             ++it;
0943         }
0944     }
0945 }
0946 
0947 /**
0948  * We will read the TinyTextEntity from characters and try to create words from there.
0949  * Note: characters might be already characters for some generators, but we will keep
0950  * the nomenclature characters for the generator produced data. The resulting
0951  * WordsWithCharacters memory has to be managed by the caller, both the
0952  * WordWithCharacters::word and WordWithCharacters::characters contents
0953  */
0954 static WordsWithCharacters makeWordFromCharacters(const TextEntity::List &characters, int pageWidth, int pageHeight)
0955 {
0956     /**
0957      * We will traverse characters and try to create words from the TinyTextEntities in it.
0958      * We will search TinyTextEntity blocks and merge them until we get a
0959      * space between two consecutive TinyTextEntities. When we get a space
0960      * we can take it as a end of word. Then we store the word as a TinyTextEntity
0961      * and keep it in newList.
0962 
0963      * We create a RegionText named regionWord that contains the word and the characters associated with it and
0964      * a rectangle area of the element in newList.
0965 
0966      */
0967     WordsWithCharacters wordsWithCharacters;
0968 
0969     TextEntity::List::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt;
0970     int newLeft, newRight, newTop, newBottom;
0971 
0972     for (; it != itEnd; it++) {
0973         QString textString = it->text();
0974         QString newString;
0975         QRect lineArea = it->area().roundedGeometry(pageWidth, pageHeight);
0976         QRect elementArea;
0977         TextEntity::List wordCharacters;
0978         tmpIt = it;
0979         int space = 0;
0980 
0981         while (!space) {
0982             if (!textString.isEmpty()) {
0983                 newString.append(textString);
0984 
0985                 // when textString is the start of the word
0986                 if (tmpIt == it) {
0987                     NormalizedRect newRect(lineArea, pageWidth, pageHeight);
0988                     wordCharacters.append(TextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
0989                 } else {
0990                     NormalizedRect newRect(elementArea, pageWidth, pageHeight);
0991                     wordCharacters.append(TextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
0992                 }
0993             }
0994 
0995             ++it;
0996 
0997             /*
0998              we must have to put this line before the if condition of it==itEnd
0999              otherwise the last character can be missed
1000              */
1001             if (it == itEnd) {
1002                 break;
1003             }
1004             elementArea = it->area().roundedGeometry(pageWidth, pageHeight);
1005             if (!doesConsumeY(elementArea, lineArea, 60)) {
1006                 --it;
1007                 break;
1008             }
1009 
1010             const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width();
1011             const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width();
1012 
1013             space = elementArea.left() - lineArea.right();
1014 
1015             if (space != 0) {
1016                 it--;
1017                 break;
1018             }
1019 
1020             newLeft = text_x1 < line_x1 ? text_x1 : line_x1;
1021             newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1022             newTop = text_y1 > line_y1 ? line_y1 : text_y1;
1023             newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1024 
1025             lineArea.setLeft(newLeft);
1026             lineArea.setTop(newTop);
1027             lineArea.setWidth(newRight - newLeft);
1028             lineArea.setHeight(newBottom - newTop);
1029 
1030             textString = it->text();
1031         }
1032 
1033         // if newString is not empty, save it
1034         if (!newString.isEmpty()) {
1035             const NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1036             TextEntity word = TextEntity(newString.normalized(QString::NormalizationForm_KC), newRect);
1037             wordsWithCharacters.append(WordWithCharacters(word, wordCharacters));
1038         }
1039 
1040         if (it == itEnd) {
1041             break;
1042         }
1043     }
1044 
1045     wordsWithCharacters.shrink_to_fit();
1046     return wordsWithCharacters;
1047 }
1048 
1049 /**
1050  * Create Lines from the words and sort them
1051  */
1052 QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
1053 {
1054     /**
1055      * We cannot assume that the generator will give us texts in the right order.
1056      * We can only assume that we will get texts in the page and their bounding
1057      * rectangle. The texts can be character, word, half-word anything.
1058      * So, we need to:
1059      **
1060      * 1. Sort rectangles/boxes containing texts by y0(top)
1061      * 2. Create textline where there is y overlap between TinyTextEntity 's
1062      * 3. Within each line sort the TinyTextEntity 's by x0(left)
1063      */
1064 
1065     QList<QPair<WordsWithCharacters, QRect>> lines;
1066 
1067     /*
1068      Make a new copy of the TextList in the words, so that the wordsTmp and lines do
1069      not contain same pointers for all the TinyTextEntity.
1070      */
1071     QList<WordWithCharacters> words = wordsTmp;
1072 
1073     // Step 1
1074     std::sort(words.begin(), words.end(), compareTinyTextEntityY);
1075 
1076     // Step 2
1077     QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end();
1078 
1079     // for every non-space texts(characters/words) in the textList
1080     for (; it != itEnd; it++) {
1081         const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight);
1082         bool found = false;
1083 
1084         for (QPair<WordsWithCharacters, QRect> &linesI : lines) {
1085             /* the line area which will be expanded
1086                line_rects is only necessary to preserve the topmin and bottommax of all
1087                the texts in the line, left and right is not necessary at all
1088             */
1089             QRect &lineArea = linesI.second;
1090             const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width();
1091             const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width();
1092 
1093             /*
1094                if the new text and the line has y overlapping parts of more than 70%,
1095                the text will be added to this line
1096              */
1097             if (doesConsumeY(elementArea, lineArea, 70)) {
1098                 WordsWithCharacters &line = linesI.first;
1099                 line.append(*it);
1100 
1101                 const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1;
1102                 const int newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1103                 const int newTop = line_y1 < text_y1 ? line_y1 : text_y1;
1104                 const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1105 
1106                 lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop);
1107                 found = true;
1108             }
1109 
1110             if (found) {
1111                 break;
1112             }
1113         }
1114 
1115         /* when we have found a new line create a new TextList containing
1116            only one element and append it to the lines
1117          */
1118         if (!found) {
1119             lines.append(QPair<WordsWithCharacters, QRect>({*it}, elementArea));
1120         }
1121     }
1122 
1123     // Step 3
1124     for (QPair<WordsWithCharacters, QRect> &line : lines) {
1125         WordsWithCharacters &list = line.first;
1126         std::sort(list.begin(), list.end(), compareTinyTextEntityX);
1127     }
1128     lines.shrink_to_fit();
1129 
1130     return lines;
1131 }
1132 
1133 /**
1134  * Calculate Statistical information from the lines we made previously
1135  */
1136 static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)
1137 {
1138     /**
1139      * For the region, defined by line_rects and lines
1140      * 1. Make line statistical analysis to find the line spacing
1141      * 2. Make character statistical analysis to differentiate between
1142      *   word spacing and column spacing.
1143      */
1144 
1145     /**
1146      * Step 0
1147      */
1148     const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight);
1149 
1150     /**
1151      * Step 1
1152      */
1153     QMap<int, int> line_space_stat;
1154     for (int i = 0; i < sortedLines.length(); i++) {
1155         const QRect rectUpper = sortedLines.at(i).second;
1156 
1157         if (i + 1 == sortedLines.length()) {
1158             break;
1159         }
1160         const QRect rectLower = sortedLines.at(i + 1).second;
1161 
1162         int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height());
1163         if (linespace < 0) {
1164             linespace = -linespace;
1165         }
1166 
1167         if (line_space_stat.contains(linespace)) {
1168             line_space_stat[linespace]++;
1169         } else {
1170             line_space_stat[linespace] = 1;
1171         }
1172     }
1173 
1174     *line_spacing = 0;
1175     int weighted_count = 0;
1176     QMapIterator<int, int> iterate_linespace(line_space_stat);
1177 
1178     while (iterate_linespace.hasNext()) {
1179         iterate_linespace.next();
1180         *line_spacing += iterate_linespace.value() * iterate_linespace.key();
1181         weighted_count += iterate_linespace.value();
1182     }
1183     if (*line_spacing != 0) {
1184         *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5);
1185     }
1186 
1187     /**
1188      * Step 2
1189      */
1190     // We would like to use QMap instead of QHash as it will keep the keys sorted
1191     QMap<int, int> hor_space_stat;
1192     QMap<int, int> col_space_stat;
1193     QList<QList<QRect>> space_rects;
1194     QVector<QRect> max_hor_space_rects;
1195 
1196     // Space in every line
1197     for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1198         const WordsWithCharacters list = sortedLine.first;
1199         QList<QRect> line_space_rects;
1200         int maxSpace = 0, minSpace = pageWidth;
1201 
1202         // for every TinyTextEntity element in the line
1203         WordsWithCharacters::ConstIterator it = list.begin(), itEnd = list.end();
1204         QRect max_area1, max_area2;
1205         // for every line
1206         for (; it != itEnd; it++) {
1207             const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight);
1208             if (it + 1 == itEnd) {
1209                 break;
1210             }
1211 
1212             const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight);
1213             int space = area2.left() - area1.right();
1214 
1215             if (space > maxSpace) {
1216                 max_area1 = area1;
1217                 max_area2 = area2;
1218                 maxSpace = space;
1219             }
1220 
1221             if (space < minSpace && space != 0) {
1222                 minSpace = space;
1223             }
1224 
1225             // if we found a real space, whose length is not zero and also less than the pageWidth
1226             if (space != 0 && space != pageWidth) {
1227                 // increase the count of the space amount
1228                 if (hor_space_stat.contains(space)) {
1229                     hor_space_stat[space]++;
1230                 } else {
1231                     hor_space_stat[space] = 1;
1232                 }
1233 
1234                 int left, right, top, bottom;
1235 
1236                 left = area1.right();
1237                 right = area2.left();
1238 
1239                 top = area2.top() < area1.top() ? area2.top() : area1.top();
1240                 bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1241 
1242                 QRect rect(left, top, right - left, bottom - top);
1243                 line_space_rects.append(rect);
1244             }
1245         }
1246 
1247         space_rects.append(line_space_rects);
1248 
1249         if (hor_space_stat.contains(maxSpace)) {
1250             if (hor_space_stat[maxSpace] != 1) {
1251                 hor_space_stat[maxSpace]--;
1252             } else {
1253                 hor_space_stat.remove(maxSpace);
1254             }
1255         }
1256 
1257         if (maxSpace != 0) {
1258             if (col_space_stat.contains(maxSpace)) {
1259                 col_space_stat[maxSpace]++;
1260             } else {
1261                 col_space_stat[maxSpace] = 1;
1262             }
1263 
1264             // store the max rect of each line
1265             const int left = max_area1.right();
1266             const int right = max_area2.left();
1267             const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top();
1268             const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom();
1269 
1270             const QRect rect(left, top, right - left, bottom - top);
1271             max_hor_space_rects.append(rect);
1272         } else {
1273             max_hor_space_rects.append(QRect(0, 0, 0, 0));
1274         }
1275     }
1276 
1277     // All the between word space counts are in hor_space_stat
1278     *word_spacing = 0;
1279     weighted_count = 0;
1280     QMapIterator<int, int> iterate(hor_space_stat);
1281 
1282     while (iterate.hasNext()) {
1283         iterate.next();
1284 
1285         if (iterate.key() > 0) {
1286             *word_spacing += iterate.value() * iterate.key();
1287             weighted_count += iterate.value();
1288         }
1289     }
1290     if (weighted_count) {
1291         *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5);
1292     }
1293 
1294     *col_spacing = 0;
1295     QMapIterator<int, int> iterate_col(col_space_stat);
1296 
1297     while (iterate_col.hasNext()) {
1298         iterate_col.next();
1299         if (iterate_col.value() > *col_spacing) {
1300             *col_spacing = iterate_col.value();
1301         }
1302     }
1303     *col_spacing = col_space_stat.key(*col_spacing);
1304 
1305     // if there is just one line in a region, there is no point in dividing it
1306     if (sortedLines.length() == 1) {
1307         *word_spacing = *col_spacing;
1308     }
1309 }
1310 
1311 /**
1312  * Implements the XY Cut algorithm for textpage segmentation
1313  * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and
1314  * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function)
1315  */
1316 static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight)
1317 {
1318     RegionTextList tree;
1319     QRect contentRect(0, 0, pageWidth, pageHeight);
1320     const RegionText root(wordsWithCharacters, contentRect);
1321 
1322     // start the tree with the root, it is our only region at the start
1323     tree.push_back(root);
1324 
1325     int i = 0;
1326 
1327     // while traversing the tree has not been ended
1328     while (i < tree.length()) {
1329         const RegionText node = tree.at(i);
1330         QRect regionRect = node.area();
1331 
1332         /**
1333          * 1. calculation of projection profiles
1334          */
1335         // allocate the size of proj profiles and initialize with 0
1336         int size_proj_y = node.area().height();
1337         int size_proj_x = node.area().width();
1338         // dynamic memory allocation
1339         QVarLengthArray<int> proj_on_xaxis(size_proj_x);
1340         QVarLengthArray<int> proj_on_yaxis(size_proj_y);
1341 
1342         for (int j = 0; j < size_proj_y; ++j) {
1343             proj_on_yaxis[j] = 0;
1344         }
1345         for (int j = 0; j < size_proj_x; ++j) {
1346             proj_on_xaxis[j] = 0;
1347         }
1348 
1349         const QList<WordWithCharacters> list = node.text();
1350 
1351         // Calculate tcx and tcy locally for each new region
1352         int word_spacing, line_spacing, column_spacing;
1353         calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing);
1354 
1355         const int tcx = word_spacing * 2;
1356         const int tcy = line_spacing * 2;
1357 
1358         int maxX = 0, maxY = 0;
1359         int avgX = 0;
1360         int count;
1361 
1362         // for every text in the region
1363         for (const WordWithCharacters &wwc : list) {
1364             const QRect entRect = wwc.area().geometry(pageWidth, pageHeight);
1365 
1366             // calculate vertical projection profile proj_on_xaxis1
1367             for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) {
1368                 if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0) {
1369                     proj_on_xaxis[k - regionRect.left()] += entRect.height();
1370                 }
1371             }
1372 
1373             // calculate horizontal projection profile in the same way
1374             for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) {
1375                 if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0) {
1376                     proj_on_yaxis[k - regionRect.top()] += entRect.width();
1377                 }
1378             }
1379         }
1380 
1381         for (int j = 0; j < size_proj_y; ++j) {
1382             if (proj_on_yaxis[j] > maxY) {
1383                 maxY = proj_on_yaxis[j];
1384             }
1385         }
1386 
1387         avgX = count = 0;
1388         for (int j = 0; j < size_proj_x; ++j) {
1389             if (proj_on_xaxis[j] > maxX) {
1390                 maxX = proj_on_xaxis[j];
1391             }
1392             if (proj_on_xaxis[j]) {
1393                 count++;
1394                 avgX += proj_on_xaxis[j];
1395             }
1396         }
1397         if (count) {
1398             avgX /= count;
1399         }
1400 
1401         /**
1402          * 2. Cleanup Boundary White Spaces and removal of noise
1403          */
1404         int xbegin = 0, xend = size_proj_x - 1;
1405         int ybegin = 0, yend = size_proj_y - 1;
1406         while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0) {
1407             xbegin++;
1408         }
1409         while (xend >= 0 && proj_on_xaxis[xend] <= 0) {
1410             xend--;
1411         }
1412         while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0) {
1413             ybegin++;
1414         }
1415         while (yend >= 0 && proj_on_yaxis[yend] <= 0) {
1416             yend--;
1417         }
1418 
1419         // update the regionRect
1420         int old_left = regionRect.left(), old_top = regionRect.top();
1421         regionRect.setLeft(old_left + xbegin);
1422         regionRect.setRight(old_left + xend);
1423         regionRect.setTop(old_top + ybegin);
1424         regionRect.setBottom(old_top + yend);
1425 
1426         int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0;
1427         for (int j = 0; j < size_proj_x; ++j) {
1428             proj_on_xaxis[j] -= tnx;
1429         }
1430         for (int j = 0; j < size_proj_y; ++j) {
1431             proj_on_yaxis[j] -= tny;
1432         }
1433 
1434         /**
1435          * 3. Find the Widest gap
1436          */
1437         int gap_hor = -1, pos_hor = -1;
1438         int begin = -1, end = -1;
1439 
1440         // find all hor_gaps and find the maximum between them
1441         for (int j = 1; j < size_proj_y; ++j) {
1442             // transition from white to black
1443             if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0) {
1444                 end = j;
1445             }
1446 
1447             // transition from black to white
1448             if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0) {
1449                 begin = j;
1450             }
1451 
1452             if (begin > 0 && end > 0 && end - begin > gap_hor) {
1453                 gap_hor = end - begin;
1454                 pos_hor = (end + begin) / 2;
1455                 begin = -1;
1456                 end = -1;
1457             }
1458         }
1459 
1460         begin = -1, end = -1;
1461         int gap_ver = -1, pos_ver = -1;
1462 
1463         // find all the ver_gaps and find the maximum between them
1464         for (int j = 1; j < size_proj_x; ++j) {
1465             // transition from white to black
1466             if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) {
1467                 end = j;
1468             }
1469 
1470             // transition from black to white
1471             if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0) {
1472                 begin = j;
1473             }
1474 
1475             if (begin > 0 && end > 0 && end - begin > gap_ver) {
1476                 gap_ver = end - begin;
1477                 pos_ver = (end + begin) / 2;
1478                 begin = -1;
1479                 end = -1;
1480             }
1481         }
1482 
1483         int cut_pos_x = pos_ver, cut_pos_y = pos_hor;
1484         int gap_x = gap_ver, gap_y = gap_hor;
1485 
1486         /**
1487          * 4. Cut the region and make nodes (left,right) or (up,down)
1488          */
1489         bool cut_hor = false, cut_ver = false;
1490 
1491         // For horizontal cut
1492         const int topHeight = cut_pos_y - (regionRect.top() - old_top);
1493         const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight);
1494         const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight);
1495 
1496         // For vertical Cut
1497         const int leftWidth = cut_pos_x - (regionRect.left() - old_left);
1498         const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height());
1499         const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height());
1500 
1501         if (gap_y >= gap_x && gap_y >= tcy) {
1502             cut_hor = true;
1503         } else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx) {
1504             cut_ver = true;
1505         } else if (gap_x >= gap_y && gap_x >= tcx) {
1506             cut_ver = true;
1507         } else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy) {
1508             cut_hor = true;
1509         } else {
1510             // no cut possible
1511             // we can now update the node rectangle with the shrinked rectangle
1512             RegionText tmpNode = tree.at(i);
1513             tmpNode.setArea(regionRect);
1514             tree.replace(i, tmpNode);
1515             i++;
1516             continue;
1517         }
1518 
1519         WordsWithCharacters list1, list2;
1520 
1521         // horizontal cut, topRect and bottomRect
1522         if (cut_hor) {
1523             for (const WordWithCharacters &word : list) {
1524                 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1525 
1526                 if (topRect.intersects(wordRect)) {
1527                     list1.append(word);
1528                 } else {
1529                     list2.append(word);
1530                 }
1531             }
1532 
1533             RegionText node1(list1, topRect);
1534             RegionText node2(list2, bottomRect);
1535 
1536             tree.replace(i, node1);
1537             tree.insert(i + 1, node2);
1538         }
1539 
1540         // vertical cut, leftRect and rightRect
1541         else if (cut_ver) {
1542             for (const WordWithCharacters &word : list) {
1543                 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1544 
1545                 if (leftRect.intersects(wordRect)) {
1546                     list1.append(word);
1547                 } else {
1548                     list2.append(word);
1549                 }
1550             }
1551 
1552             RegionText node1(list1, leftRect);
1553             RegionText node2(list2, rightRect);
1554 
1555             tree.replace(i, node1);
1556             tree.insert(i + 1, node2);
1557         }
1558     }
1559 
1560     tree.shrink_to_fit();
1561     return tree;
1562 }
1563 
1564 /**
1565  * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed
1566  */
1567 TextEntity::List addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight)
1568 {
1569     /**
1570      * 1. Call makeAndSortLines before adding spaces in between words in a line
1571      * 2. Now add spaces between every two words in a line
1572      * 3. Finally, extract all the space separated texts from each region and return it
1573      */
1574 
1575     TextEntity::List res;
1576     // Only change the texts under RegionTexts, not the area
1577     for (const RegionText &tmpRegion : std::as_const(tree)) {
1578         // Step 01
1579         QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight);
1580         int counter = 0;
1581 
1582         // Step 02
1583         for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1584             WordsWithCharacters &list = sortedLine.first;
1585             for (int k = 0; k < list.length(); k++) {
1586                 const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight);
1587                 if (k + 1 >= list.length()) {
1588                     break;
1589                 }
1590 
1591                 const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight);
1592                 const int space = area2.left() - area1.right();
1593 
1594                 if (space != 0) {
1595                     // Make a TinyTextEntity of string space and push it between it and it+1
1596                     const int left = area1.right();
1597                     const int right = area2.left();
1598                     const int top = area2.top() < area1.top() ? area2.top() : area1.top();
1599                     const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1600 
1601                     const QString spaceStr(QStringLiteral(" "));
1602                     const QRect rect(QPoint(left, top), QPoint(right, bottom));
1603                     const NormalizedRect entRect(rect, pageWidth, pageHeight);
1604                     TextEntity ent1 = TextEntity(spaceStr, entRect);
1605                     WordWithCharacters word(ent1, QList<TextEntity>() << ent1);
1606 
1607                     list.insert(k + 1, word);
1608 
1609                     // Skip the space
1610                     k++;
1611                 }
1612             }
1613             counter += list.length();
1614         }
1615         res.reserve(res.length() + counter);
1616 
1617         for (const QPair<WordsWithCharacters, QRect> &sortedLine : std::as_const(sortedLines)) {
1618             for (const WordWithCharacters &word : sortedLine.first) {
1619                 res += word.characters;
1620             }
1621         }
1622     }
1623 
1624     // Step 03
1625     res.shrink_to_fit();
1626     return res;
1627 }
1628 
1629 /**
1630  * Correct the textOrder, all layout recognition works here
1631  */
1632 void TextPagePrivate::correctTextOrder()
1633 {
1634     // m_page->width() and m_page->height() are in pixels at
1635     // 100% zoom level, and thus depend on display DPI.
1636     // To avoid Okular failing on lowDPI displays,
1637     // we scale pageWidth and pageHeight so their sum equals 2000.
1638     const double scalingFactor = 2000.0 / (m_page->width() + m_page->height());
1639     const int pageWidth = (int)(scalingFactor * m_page->width());
1640     const int pageHeight = (int)(scalingFactor * m_page->height());
1641 
1642     TextEntity::List characters = m_words;
1643 
1644     /**
1645      * Remove spaces from the text
1646      */
1647     removeSpace(&characters);
1648 
1649     /**
1650      * Construct words from characters
1651      */
1652     const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight);
1653 
1654     /**
1655      * Make a XY Cut tree for segmentation of the texts
1656      */
1657     RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight);
1658 
1659     /**
1660      * Add spaces to the word
1661      */
1662     const auto listOfCharacters = addNecessarySpace(std::move(tree), pageWidth, pageHeight);
1663 
1664     setWordList(listOfCharacters);
1665 }
1666 
1667 TextEntity::List TextPage::words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
1668 {
1669     if (area && area->isNull()) {
1670         return TextEntity::List();
1671     }
1672 
1673     TextEntity::List ret;
1674     if (area) {
1675         for (const TextEntity &te : std::as_const(d->m_words)) {
1676             if (b == AnyPixelTextAreaInclusionBehaviour) {
1677                 if (area->intersects(te.area())) {
1678                     ret.append(te);
1679                 }
1680             } else {
1681                 const NormalizedPoint center = te.area().center();
1682                 if (area->contains(center.x, center.y)) {
1683                     ret.append(te);
1684                 }
1685             }
1686         }
1687     } else {
1688         for (const TextEntity &te : std::as_const(d->m_words)) {
1689             ret.append(te);
1690         }
1691     }
1692     return ret;
1693 }
1694 
1695 RegularAreaRect *TextPage::wordAt(const NormalizedPoint &p, QString *word) const
1696 {
1697     TextEntity::List::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1698     TextEntity::List::ConstIterator it = itBegin;
1699     TextEntity::List::ConstIterator posIt = itEnd;
1700     for (; it != itEnd; ++it) {
1701         if (it->area().contains(p.x, p.y)) {
1702             posIt = it;
1703             break;
1704         }
1705     }
1706     if (posIt != itEnd) {
1707         if (posIt->text().simplified().isEmpty()) {
1708             return nullptr;
1709         }
1710         // Find the first TinyTextEntity of the word
1711         while (posIt != itBegin) {
1712             --posIt;
1713             const QString itText = posIt->text();
1714             if (itText.right(1).at(0).isSpace()) {
1715                 if (itText.endsWith(QLatin1String("-\n"))) {
1716                     // Is an hyphenated word
1717                     // continue searching the start of the word back
1718                     continue;
1719                 }
1720 
1721                 if (itText == QLatin1String("\n") && posIt != itBegin) {
1722                     --posIt;
1723                     if (posIt->text().endsWith(QLatin1String("-"))) {
1724                         // Is an hyphenated word
1725                         // continue searching the start of the word back
1726                         continue;
1727                     }
1728                     ++posIt;
1729                 }
1730 
1731                 ++posIt;
1732                 break;
1733             }
1734         }
1735         RegularAreaRect *ret = new RegularAreaRect();
1736         QString foundWord;
1737         for (; posIt != itEnd; ++posIt) {
1738             const QString itText = posIt->text();
1739             if (itText.simplified().isEmpty()) {
1740                 break;
1741             }
1742 
1743             ret->appendShape(posIt->area());
1744             foundWord += posIt->text();
1745             if (itText.right(1).at(0).isSpace()) {
1746                 if (!foundWord.endsWith(QLatin1String("-\n"))) {
1747                     break;
1748                 }
1749             }
1750         }
1751 
1752         if (word) {
1753             *word = foundWord;
1754         }
1755         return ret;
1756     } else {
1757         return nullptr;
1758     }
1759 }