File indexing completed on 2024-05-12 08:32:57

0001 /*
0002     SPDX-FileCopyrightText: 2005 Piotr Szymanski <niedakh@gmail.com>
0003 
0004     SPDX-License-Identifier: GPL-2.0-or-later
0005 */
0006 
0007 #include "textpage.h"
0008 #include "textpage_p.h"
0009 
0010 #include <QDebug>
0011 
0012 #include "area.h"
0013 #include "debug_p.h"
0014 #include "misc.h"
0015 #include "page.h"
0016 #include "page_p.h"
0017 
0018 #include <cstring>
0019 
0020 #include <QVarLengthArray>
0021 #include <QtAlgorithms>
0022 
0023 using namespace Okular;
0024 
0025 class SearchPoint
0026 {
0027 public:
0028     SearchPoint()
0029         : offset_begin(-1)
0030         , offset_end(-1)
0031     {
0032     }
0033 
0034     /** The TinyTextEntity containing the first character of the match. */
0035     TextList::ConstIterator it_begin;
0036 
0037     /** The TinyTextEntity containing the last character of the match. */
0038     TextList::ConstIterator it_end;
0039 
0040     /** The index of the first character of the match in (*it_begin)->text().
0041      *  Satisfies 0 <= offset_begin < (*it_begin)->text().length().
0042      */
0043     int offset_begin;
0044 
0045     /** One plus the index of the last character of the match in (*it_end)->text().
0046      *  Satisfies 0 < offset_end <= (*it_end)->text().length().
0047      */
0048     int offset_end;
0049 };
0050 
0051 /* text comparison functions */
0052 
0053 static bool CaseInsensitiveCmpFn(QStringView from, QStringView to)
0054 {
0055 #ifdef DEBUG_TEXTPAGE
0056     qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)";
0057 #endif
0058     return from.compare(to, Qt::CaseInsensitive) == 0;
0059 }
0060 
0061 static bool CaseSensitiveCmpFn(QStringView from, QStringView to)
0062 {
0063 #ifdef DEBUG_TEXTPAGE
0064     qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)";
0065 #endif
0066     return from.compare(to, Qt::CaseSensitive) == 0;
0067 }
0068 
0069 /**
0070  * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line
0071  * overlap within @p threshold percent, i. e. iff the ratio of the length of the
0072  * intersection of the segments to the length of the shortest of the two input segments
0073  * is not smaller than the threshold.
0074  */
0075 static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold)
0076 {
0077     // check if one consumes another fully (speed optimization)
0078 
0079     if (left1 <= left2 && right1 >= right2) {
0080         return true;
0081     }
0082 
0083     if (left1 >= left2 && right1 <= right2) {
0084         return true;
0085     }
0086 
0087     // check if there is overlap above threshold
0088     if (right2 >= left1 && right1 >= left2) {
0089         double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1;
0090 
0091         double length1 = right1 - left1, length2 = right2 - left2;
0092 
0093         return overlap * 100 >= threshold * qMin(length1, length2);
0094     }
0095 
0096     return false;
0097 }
0098 
0099 static bool doesConsumeY(const QRect first, const QRect second, int threshold)
0100 {
0101     return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold);
0102 }
0103 
0104 static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold)
0105 {
0106     return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold);
0107 }
0108 
0109 /*
0110   Rationale behind TinyTextEntity:
0111 
0112   instead of storing directly a QString for the text of an entity,
0113   we store the UTF-16 data and their length. This way, we save about
0114   4 int's wrt a QString, and we can create a new string from that
0115   raw data (that's the only penalty of that).
0116   Even better, if the string we need to store has at most
0117   MaxStaticChars characters, then we store those in place of the QChar*
0118   that would be used (with new[] + free[]) for the data.
0119  */
0120 class TinyTextEntity
0121 {
0122     static const int MaxStaticChars = sizeof(void *) / sizeof(QChar);
0123 
0124 public:
0125     TinyTextEntity(const QString &text, const NormalizedRect &rect)
0126         : area(rect)
0127     {
0128         Q_ASSERT_X(!text.isEmpty(), "TinyTextEntity", "empty string");
0129         Q_ASSERT_X(sizeof(d) == sizeof(void *), "TinyTextEntity", "internal storage is wider than QChar*, fix it!");
0130         length = text.length();
0131         switch (length) {
0132 #if QT_POINTER_SIZE >= 8
0133         case 4:
0134             d.qc[3] = text.at(3).unicode();
0135             // fall through
0136         case 3:
0137             d.qc[2] = text.at(2).unicode();
0138 #endif
0139             // fall through
0140         case 2:
0141             d.qc[1] = text.at(1).unicode();
0142             // fall through
0143         case 1:
0144             d.qc[0] = text.at(0).unicode();
0145             break;
0146         default:
0147             d.data = new QChar[length];
0148             std::memcpy(d.data, text.constData(), length * sizeof(QChar));
0149         }
0150     }
0151 
0152     ~TinyTextEntity()
0153     {
0154         if (length > MaxStaticChars) {
0155             delete[] d.data;
0156         }
0157     }
0158 
0159     inline QString text() const
0160     {
0161         return length <= MaxStaticChars ? QString::fromRawData((const QChar *)&d.qc[0], length) : QString::fromRawData(d.data, length);
0162     }
0163 
0164     inline NormalizedRect transformedArea(const QTransform &matrix) const
0165     {
0166         NormalizedRect transformed_area = area;
0167         transformed_area.transform(matrix);
0168         return transformed_area;
0169     }
0170 
0171     NormalizedRect area;
0172 
0173 private:
0174     Q_DISABLE_COPY(TinyTextEntity)
0175 
0176     union {
0177         QChar *data;
0178         ushort qc[MaxStaticChars];
0179     } d;
0180     int length;
0181 };
0182 
0183 TextEntity::TextEntity(const QString &text, NormalizedRect *area)
0184     : m_text(text)
0185     , m_area(area)
0186     , d(nullptr)
0187 {
0188 }
0189 
0190 TextEntity::~TextEntity()
0191 {
0192     delete m_area;
0193 }
0194 
0195 QString TextEntity::text() const
0196 {
0197     return m_text;
0198 }
0199 
0200 NormalizedRect *TextEntity::area() const
0201 {
0202     return m_area;
0203 }
0204 
0205 NormalizedRect TextEntity::transformedArea(const QTransform &matrix) const
0206 {
0207     NormalizedRect transformed_area = *m_area;
0208     transformed_area.transform(matrix);
0209     return transformed_area;
0210 }
0211 
0212 TextPagePrivate::TextPagePrivate()
0213     : m_page(nullptr)
0214 {
0215 }
0216 
0217 TextPagePrivate::~TextPagePrivate()
0218 {
0219     qDeleteAll(m_searchPoints);
0220     qDeleteAll(m_words);
0221 }
0222 
0223 TextPage::TextPage()
0224     : d(new TextPagePrivate())
0225 {
0226 }
0227 
0228 TextPage::TextPage(const TextEntity::List &words)
0229     : d(new TextPagePrivate())
0230 {
0231     TextEntity::List::ConstIterator it = words.constBegin(), itEnd = words.constEnd();
0232     for (; it != itEnd; ++it) {
0233         TextEntity *e = *it;
0234         if (!e->text().isEmpty()) {
0235             d->m_words.append(new TinyTextEntity(e->text(), *e->area()));
0236         }
0237         delete e;
0238     }
0239 }
0240 
0241 TextPage::~TextPage()
0242 {
0243     delete d;
0244 }
0245 
0246 void TextPage::append(const QString &text, NormalizedRect *area)
0247 {
0248     if (!text.isEmpty()) {
0249         if (!d->m_words.isEmpty()) {
0250             TinyTextEntity *lastEntity = d->m_words.last();
0251             const QString concatText = lastEntity->text() + text.normalized(QString::NormalizationForm_KC);
0252             if (concatText != concatText.normalized(QString::NormalizationForm_KC)) {
0253                 // If this happens it means that the new text + old one have combined, for example A and ◌̊  form Å
0254                 NormalizedRect newArea = *area | lastEntity->area;
0255                 delete area;
0256                 delete lastEntity;
0257                 d->m_words.removeLast();
0258                 d->m_words.append(new TinyTextEntity(concatText.normalized(QString::NormalizationForm_KC), newArea));
0259                 return;
0260             }
0261         }
0262 
0263         d->m_words.append(new TinyTextEntity(text.normalized(QString::NormalizationForm_KC), *area));
0264     }
0265     delete area;
0266 }
0267 
0268 struct WordWithCharacters {
0269     WordWithCharacters(TinyTextEntity *w, const TextList &c)
0270         : word(w)
0271         , characters(c)
0272     {
0273     }
0274 
0275     inline QString text() const
0276     {
0277         return word->text();
0278     }
0279 
0280     inline const NormalizedRect &area() const
0281     {
0282         return word->area;
0283     }
0284 
0285     TinyTextEntity *word;
0286     TextList characters;
0287 };
0288 typedef QList<WordWithCharacters> WordsWithCharacters;
0289 
0290 /**
0291  * We will divide the whole page in some regions depending on the horizontal and
0292  * vertical spacing among different regions. Each region will have an area and an
0293  * associated WordsWithCharacters in sorted order.
0294  */
0295 class RegionText
0296 {
0297 public:
0298     RegionText() {};
0299 
0300     RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area)
0301         : m_region_wordWithCharacters(wordsWithCharacters)
0302         , m_area(area)
0303     {
0304     }
0305 
0306     inline QString string() const
0307     {
0308         QString res;
0309         for (const WordWithCharacters &word : m_region_wordWithCharacters) {
0310             res += word.text();
0311         }
0312         return res;
0313     }
0314 
0315     inline WordsWithCharacters text() const
0316     {
0317         return m_region_wordWithCharacters;
0318     }
0319 
0320     inline QRect area() const
0321     {
0322         return m_area;
0323     }
0324 
0325     inline void setArea(const QRect area)
0326     {
0327         m_area = area;
0328     }
0329 
0330     inline void setText(const WordsWithCharacters &wordsWithCharacters)
0331     {
0332         m_region_wordWithCharacters = wordsWithCharacters;
0333     }
0334 
0335 private:
0336     WordsWithCharacters m_region_wordWithCharacters;
0337     QRect m_area;
0338 };
0339 
0340 RegularAreaRect *TextPage::textArea(TextSelection *sel) const
0341 {
0342     if (d->m_words.isEmpty()) {
0343         return new RegularAreaRect();
0344     }
0345 
0346     /**
0347         It works like this:
0348         There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0)
0349         rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those
0350         points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search
0351         for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both
0352         of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a
0353         1. (rx,ry)x(1,ty)
0354         2. (0,ty)x(1,uy)
0355         3. (0,uy)x(vx,vy)
0356 
0357         To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor
0358         or that has a left border >= cx and bottom border >= cy.
0359     */
0360     RegularAreaRect *ret = new RegularAreaRect;
0361 
0362     PagePrivate *pagePrivate = PagePrivate::get(d->m_page);
0363     const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
0364     const double scaleX = d->m_page->width();
0365     const double scaleY = d->m_page->height();
0366 
0367     NormalizedPoint startC = sel->start();
0368     NormalizedPoint endC = sel->end();
0369     NormalizedPoint temp;
0370 
0371     // if startPoint is right to endPoint swap them
0372     if (startC.x > endC.x) {
0373         temp = startC;
0374         startC = endC;
0375         endC = temp;
0376     }
0377 
0378     // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document
0379     const NormalizedRect boundingRect = d->m_page->boundingBox();
0380     const QRect content = boundingRect.geometry(scaleX, scaleY);
0381     const double minX = content.left();
0382     const double maxX = content.right();
0383     const double minY = content.top();
0384     const double maxY = content.bottom();
0385 
0386     /**
0387      * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for
0388      * the endRectangle. We have four cases:
0389      *
0390      * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start
0391      * and endPoint are outof the bounding rect (do not intersect)
0392      *
0393      * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle
0394      *
0395      * Case 2(a): find the rectangle which contains start and endpoint and having some
0396      * TextEntity
0397      *
0398      * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any
0399      * TextEntity within the rect made by startPoint and endPoint
0400      *
0401      * Case 3: Now, we may have two type of selection.
0402      * 1. startpoint is left-top of start_end and endpoint is right-bottom
0403      * 2. startpoint is left-bottom of start_end and endpoint is top-right
0404      *
0405      * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have
0406      * text within them. so, we need to search for the best suitable textposition for start and end.
0407      *
0408      * Case 3(a): We search the nearest rectangle consisting of some
0409      * TinyTextEntity right to or bottom of the startPoint for selection 01.
0410      * And, for selection 02, we have to search for right and top
0411      *
0412      * Case 3(b): For endpoint, we have to find the point top of or left to
0413      * endpoint if we have selection 01.
0414      * Otherwise, the search will be left and bottom
0415      */
0416 
0417     // we know that startC.x > endC.x, we need to decide which is top and which is bottom
0418     const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y);
0419 
0420     // Case 1(a)
0421     if (!boundingRect.intersects(start_end)) {
0422         return ret;
0423     } else {
0424         // case 1(b)
0425         /**
0426             note that, after swapping of start and end, we know that,
0427             start is always left to end. but, we cannot say start is
0428             positioned upper than end.
0429         **/
0430         // if start is left to content rect take it to content rect boundary
0431         if (startC.x * scaleX < minX) {
0432             startC.x = minX / scaleX;
0433         }
0434         if (endC.x * scaleX > maxX) {
0435             endC.x = maxX / scaleX;
0436         }
0437 
0438         // if start is top to end (selection type 01)
0439         if (startC.y * scaleY < minY) {
0440             startC.y = minY / scaleY;
0441         }
0442         if (endC.y * scaleY > maxY) {
0443             endC.y = maxY / scaleY;
0444         }
0445 
0446         // if start is bottom to end (selection type 02)
0447         if (startC.y * scaleY > maxY) {
0448             startC.y = maxY / scaleY;
0449         }
0450         if (endC.y * scaleY < minY) {
0451             endC.y = minY / scaleY;
0452         }
0453     }
0454 
0455     TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
0456     TextList::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd;
0457     const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight;
0458 
0459     NormalizedRect tmp;
0460     // case 2(a)
0461     for (; it != itEnd; ++it) {
0462         tmp = (*it)->area;
0463         if (tmp.contains(startC.x, startC.y)) {
0464             start = it;
0465         }
0466         if (tmp.contains(endC.x, endC.y)) {
0467             end = it;
0468         }
0469     }
0470 
0471     // case 2(b)
0472     it = tmpIt;
0473     if (start == it && end == itEnd) {
0474         for (; it != itEnd; ++it) {
0475             // is there any text rectangle within the start_end rect
0476             tmp = (*it)->area;
0477             if (start_end.intersects(tmp)) {
0478                 break;
0479             }
0480         }
0481 
0482         // we have searched every text entities, but none is within the rectangle created by start and end
0483         // so, no selection should be done
0484         if (it == itEnd) {
0485             return ret;
0486         }
0487     }
0488     it = tmpIt;
0489     bool selection_two_start = false;
0490 
0491     // case 3.a
0492     if (start == it) {
0493         NormalizedRect rect;
0494 
0495         // selection type 01
0496         if (startC.y <= endC.y) {
0497             for (; it != itEnd; ++it) {
0498                 rect = (*it)->area;
0499                 bool flagV = !rect.isBottom(startC);
0500 
0501                 if (flagV && rect.isRight(startC)) {
0502                     start = it;
0503                     break;
0504                 }
0505             }
0506         }
0507 
0508         // selection type 02
0509         else {
0510             selection_two_start = true;
0511             int distance = scaleX + scaleY + 100;
0512 
0513             for (; it != itEnd; ++it) {
0514                 rect = (*it)->area;
0515 
0516                 if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) {
0517                     QRect entRect = rect.geometry(scaleX, scaleY);
0518                     int xdist, ydist;
0519                     xdist = entRect.center().x() - startC.x * scaleX;
0520                     ydist = entRect.center().y() - startC.y * scaleY;
0521 
0522                     // make them positive
0523                     if (xdist < 0) {
0524                         xdist = -xdist;
0525                     }
0526                     if (ydist < 0) {
0527                         ydist = -ydist;
0528                     }
0529 
0530                     if ((xdist + ydist) < distance) {
0531                         distance = xdist + ydist;
0532                         start = it;
0533                     }
0534                 }
0535             }
0536         }
0537     }
0538 
0539     // case 3.b
0540     if (end == itEnd) {
0541         it = tmpIt;
0542         itEnd = itEnd - 1;
0543 
0544         NormalizedRect rect;
0545 
0546         if (startC.y <= endC.y) {
0547             for (; itEnd >= it; itEnd--) {
0548                 rect = (*itEnd)->area;
0549                 bool flagV = !rect.isTop(endC);
0550 
0551                 if (flagV && rect.isLeft(endC)) {
0552                     end = itEnd;
0553                     break;
0554                 }
0555             }
0556         }
0557 
0558         else {
0559             int distance = scaleX + scaleY + 100;
0560             for (; itEnd >= it; itEnd--) {
0561                 rect = (*itEnd)->area;
0562 
0563                 if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) {
0564                     QRect entRect = rect.geometry(scaleX, scaleY);
0565                     int xdist, ydist;
0566                     xdist = entRect.center().x() - endC.x * scaleX;
0567                     ydist = entRect.center().y() - endC.y * scaleY;
0568 
0569                     // make them positive
0570                     if (xdist < 0) {
0571                         xdist = -xdist;
0572                     }
0573                     if (ydist < 0) {
0574                         ydist = -ydist;
0575                     }
0576 
0577                     if ((xdist + ydist) < distance) {
0578                         distance = xdist + ydist;
0579                         end = itEnd;
0580                     }
0581                 }
0582             }
0583         }
0584     }
0585 
0586     /* if start and end in selection 02 are in the same column, and we
0587      start at an empty space we have to remove the selection of last
0588      character
0589     */
0590     if (selection_two_start) {
0591         if (start > end) {
0592             start = start - 1;
0593         }
0594     }
0595 
0596     // if start is less than end swap them
0597     if (start > end) {
0598         it = start;
0599         start = end;
0600         end = it;
0601     }
0602 
0603     // removes the possibility of crash, in case none of 1 to 3 is true
0604     if (end == d->m_words.constEnd()) {
0605         end--;
0606     }
0607 
0608     for (; start <= end; start++) {
0609         ret->appendShape((*start)->transformedArea(matrix), side);
0610     }
0611 
0612     return ret;
0613 }
0614 
0615 RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
0616 {
0617     SearchDirection dir = direct;
0618     // invalid search request
0619     if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull())) {
0620         return nullptr;
0621     }
0622     TextList::ConstIterator start;
0623     int start_offset = 0;
0624     TextList::ConstIterator end;
0625     const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID);
0626     if (sIt == d->m_searchPoints.constEnd()) {
0627         // if no previous run of this search is found, then set it to start
0628         // from the beginning (respecting the search direction)
0629         if (dir == NextResult) {
0630             dir = FromTop;
0631         } else if (dir == PreviousResult) {
0632             dir = FromBottom;
0633         }
0634     }
0635     bool forward = true;
0636     switch (dir) {
0637     case FromTop:
0638         start = d->m_words.constBegin();
0639         start_offset = 0;
0640         end = d->m_words.constEnd();
0641         break;
0642     case FromBottom:
0643         start = d->m_words.constEnd();
0644         start_offset = 0;
0645         end = d->m_words.constBegin();
0646         forward = false;
0647         break;
0648     case NextResult:
0649         start = (*sIt)->it_end;
0650         start_offset = (*sIt)->offset_end;
0651         end = d->m_words.constEnd();
0652         break;
0653     case PreviousResult:
0654         start = (*sIt)->it_begin;
0655         start_offset = (*sIt)->offset_begin;
0656         end = d->m_words.constBegin();
0657         forward = false;
0658         break;
0659     };
0660     RegularAreaRect *ret = nullptr;
0661     const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn;
0662     if (forward) {
0663         ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end);
0664     } else {
0665         ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end);
0666     }
0667     return ret;
0668 }
0669 
0670 // hyphenated '-' must be at the end of a word, so hyphenation means
0671 // we have a '-' just followed by a '\n' character
0672 // check if the string contains a '-' character
0673 // if the '-' is the last entry
0674 static int stringLengthAdaptedWithHyphen(const QString &str, TextList::ConstIterator it, TextList::ConstIterator textListEnd)
0675 {
0676     const int len = str.length();
0677 
0678     // hyphenated '-' must be at the end of a word, so hyphenation means
0679     // we have a '-' just followed by a '\n' character
0680     // check if the string contains a '-' character
0681     // if the '-' is the last entry
0682     if (str.endsWith(QLatin1Char('-'))) {
0683         // validity chek of it + 1
0684         if ((it + 1) != textListEnd) {
0685             // 1. if the next character is '\n'
0686             const QString &lookahedStr = (*(it + 1))->text();
0687             if (lookahedStr.startsWith(QLatin1Char('\n'))) {
0688                 return len - 1;
0689             }
0690 
0691             // 2. if the next word is in a different line or not
0692             const NormalizedRect &hyphenArea = (*it)->area;
0693             const NormalizedRect &lookaheadArea = (*(it + 1))->area;
0694 
0695             // lookahead to check whether both the '-' rect and next character rect overlap
0696             if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) {
0697                 return len - 1;
0698             }
0699         }
0700     }
0701     // else if it is the second last entry - for example in pdf format
0702     else if (str.endsWith(QLatin1String("-\n"))) {
0703         return len - 2;
0704     }
0705 
0706     return len;
0707 }
0708 
0709 RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp)
0710 {
0711     PagePrivate *pagePrivate = PagePrivate::get(m_page);
0712     const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
0713     RegularAreaRect *ret = new RegularAreaRect;
0714 
0715     for (TextList::ConstIterator it = sp->it_begin;; it++) {
0716         const TinyTextEntity *curEntity = *it;
0717         ret->append(curEntity->transformedArea(matrix));
0718 
0719         if (it == sp->it_end) {
0720             break;
0721         }
0722     }
0723 
0724     ret->simplify();
0725     return ret;
0726 }
0727 
0728 RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, TextList::ConstIterator start, int start_offset, TextList::ConstIterator end)
0729 {
0730     // normalize query search all unicode (including glyphs)
0731     const QString query = _query.normalized(QString::NormalizationForm_KC);
0732 
0733     // j is the current position in our query
0734     // queryLeft is the length of the query we have left to match
0735     int j = 0, queryLeft = query.length();
0736 
0737     TextList::ConstIterator it = start;
0738     int offset = start_offset;
0739 
0740     TextList::ConstIterator it_begin = TextList::ConstIterator();
0741     int offset_begin = 0; // dummy initial value to suppress compiler warnings
0742 
0743     while (it != end) {
0744         const TinyTextEntity *curEntity = *it;
0745         const QString &str = curEntity->text();
0746         const int strLen = str.length();
0747         const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
0748         // adjustedLen <= strLen
0749 
0750         if (offset >= strLen) {
0751             it++;
0752             offset = 0;
0753             continue;
0754         }
0755 
0756         if (it_begin == TextList::ConstIterator()) {
0757             it_begin = it;
0758             offset_begin = offset;
0759         }
0760 
0761         // Let the user write the hyphen or not when searching for text
0762         int matchedLen = -1;
0763         for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
0764             // we have equal (or less than) area of the query left as the length of the current
0765             // entity
0766             const int min = qMin(queryLeft, matchingLen - offset);
0767             if (comparer(QStringView {str}.mid(offset, min), QStringView {query}.mid(j, min))) {
0768                 matchedLen = min;
0769                 break;
0770             }
0771         }
0772 
0773         if (matchedLen == -1) {
0774             // we have not matched
0775             // this means we do not have a complete match
0776             // we need to get back to query start
0777             // and continue the search from this place
0778 #ifdef DEBUG_TEXTPAGE
0779             qCDebug(OkularCoreDebug) << "\tnot matched";
0780 #endif
0781             j = 0;
0782             queryLeft = query.length();
0783             it = it_begin;
0784             offset = offset_begin + 1;
0785             it_begin = TextList::ConstIterator();
0786         } else {
0787             // we have a match
0788             // move the current position in the query
0789             // to the position after the length of this string
0790             // we matched
0791             // subtract the length of the current entity from
0792             // the left length of the query
0793 #ifdef DEBUG_TEXTPAGE
0794             qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen;
0795 #endif
0796             j += matchedLen;
0797             queryLeft -= matchedLen;
0798 
0799             if (queryLeft == 0) {
0800                 // save or update the search point for the current searchID
0801                 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
0802                 if (sIt == m_searchPoints.end()) {
0803                     sIt = m_searchPoints.insert(searchID, new SearchPoint);
0804                 }
0805                 SearchPoint *sp = *sIt;
0806                 sp->it_begin = it_begin;
0807                 sp->it_end = it;
0808                 sp->offset_begin = offset_begin;
0809                 sp->offset_end = offset + matchedLen;
0810                 return searchPointToArea(sp);
0811             }
0812 
0813             it++;
0814             offset = 0;
0815         }
0816     }
0817     // end of loop - it means that we've ended the textentities
0818 
0819     const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
0820     if (sIt != m_searchPoints.end()) {
0821         SearchPoint *sp = *sIt;
0822         m_searchPoints.erase(sIt);
0823         delete sp;
0824     }
0825     return nullptr;
0826 }
0827 
0828 RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, TextList::ConstIterator start, int start_offset, TextList::ConstIterator end)
0829 {
0830     // normalize query to search all unicode (including glyphs)
0831     const QString query = _query.normalized(QString::NormalizationForm_KC);
0832 
0833     // j is the current position in our query
0834     // len is the length of the string in TextEntity
0835     // queryLeft is the length of the query we have left
0836     int j = query.length(), queryLeft = query.length();
0837 
0838     TextList::ConstIterator it = start;
0839     int offset = start_offset;
0840 
0841     TextList::ConstIterator it_begin = TextList::ConstIterator();
0842     int offset_begin = 0; // dummy initial value to suppress compiler warnings
0843 
0844     while (true) {
0845         if (offset <= 0) {
0846             if (it == end) {
0847                 break;
0848             }
0849             it--;
0850         }
0851 
0852         const TinyTextEntity *curEntity = *it;
0853         const QString &str = curEntity->text();
0854         const int strLen = str.length();
0855         const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
0856         // adjustedLen <= strLen
0857 
0858         if (offset <= 0) {
0859             offset = strLen;
0860         }
0861 
0862         if (it_begin == TextList::ConstIterator()) {
0863             it_begin = it;
0864             offset_begin = offset;
0865         }
0866 
0867         // Let the user write the hyphen or not when searching for text
0868         int matchedLen = -1;
0869         // we have equal (or less than) area of the query left as the length of the current
0870         // entity
0871         for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
0872             const int hyphenOffset = (strLen - matchingLen);
0873             const int min = qMin(queryLeft + hyphenOffset, offset);
0874             if (comparer(QStringView {str}.mid(offset - min, min - hyphenOffset), QStringView {query}.mid(j - min + hyphenOffset, min - hyphenOffset))) {
0875                 matchedLen = min - hyphenOffset;
0876                 break;
0877             }
0878         }
0879 
0880         if (matchedLen == -1) {
0881             // we have not matched
0882             // this means we do not have a complete match
0883             // we need to get back to query start
0884             // and continue the search from this place
0885 #ifdef DEBUG_TEXTPAGE
0886             qCDebug(OkularCoreDebug) << "\tnot matched";
0887 #endif
0888 
0889             j = query.length();
0890             queryLeft = query.length();
0891             it = it_begin;
0892             offset = offset_begin - 1;
0893             it_begin = TextList::ConstIterator();
0894         } else {
0895             // we have a match
0896             // move the current position in the query
0897             // to the position after the length of this string
0898             // we matched
0899             // subtract the length of the current entity from
0900             // the left length of the query
0901 #ifdef DEBUG_TEXTPAGE
0902             qCDebug(OkularCoreDebug) << "\tmatched";
0903 #endif
0904             j -= matchedLen;
0905             queryLeft -= matchedLen;
0906 
0907             if (queryLeft == 0) {
0908                 // save or update the search point for the current searchID
0909                 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
0910                 if (sIt == m_searchPoints.end()) {
0911                     sIt = m_searchPoints.insert(searchID, new SearchPoint);
0912                 }
0913                 SearchPoint *sp = *sIt;
0914                 sp->it_begin = it;
0915                 sp->it_end = it_begin;
0916                 sp->offset_begin = offset - matchedLen;
0917                 sp->offset_end = offset_begin;
0918                 return searchPointToArea(sp);
0919             }
0920 
0921             offset = 0;
0922         }
0923     }
0924     // end of loop - it means that we've ended the textentities
0925 
0926     const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
0927     if (sIt != m_searchPoints.end()) {
0928         SearchPoint *sp = *sIt;
0929         m_searchPoints.erase(sIt);
0930         delete sp;
0931     }
0932     return nullptr;
0933 }
0934 
0935 QString TextPage::text(const RegularAreaRect *area) const
0936 {
0937     return text(area, AnyPixelTextAreaInclusionBehaviour);
0938 }
0939 
0940 QString TextPage::text(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
0941 {
0942     if (area && area->isNull()) {
0943         return QString();
0944     }
0945 
0946     TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
0947     QString ret;
0948     if (area) {
0949         for (; it != itEnd; ++it) {
0950             if (b == AnyPixelTextAreaInclusionBehaviour) {
0951                 if (area->intersects((*it)->area)) {
0952                     ret += (*it)->text();
0953                 }
0954             } else {
0955                 NormalizedPoint center = (*it)->area.center();
0956                 if (area->contains(center.x, center.y)) {
0957                     ret += (*it)->text();
0958                 }
0959             }
0960         }
0961     } else {
0962         for (; it != itEnd; ++it) {
0963             ret += (*it)->text();
0964         }
0965     }
0966     return ret;
0967 }
0968 
0969 static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second)
0970 {
0971     QRect firstArea = first.area().roundedGeometry(1000, 1000);
0972     QRect secondArea = second.area().roundedGeometry(1000, 1000);
0973 
0974     return firstArea.left() < secondArea.left();
0975 }
0976 
0977 static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second)
0978 {
0979     const QRect firstArea = first.area().roundedGeometry(1000, 1000);
0980     const QRect secondArea = second.area().roundedGeometry(1000, 1000);
0981 
0982     return firstArea.top() < secondArea.top();
0983 }
0984 
0985 /**
0986  * Sets a new world list. Deleting the contents of the old one
0987  */
0988 void TextPagePrivate::setWordList(const TextList &list)
0989 {
0990     qDeleteAll(m_words);
0991     m_words = list;
0992 }
0993 
0994 /**
0995  * Remove all the spaces in between texts. It will make all the generators
0996  * same, whether they save spaces(like pdf) or not(like djvu).
0997  */
0998 static void removeSpace(TextList *words)
0999 {
1000     TextList::Iterator it = words->begin();
1001     const QString str(QLatin1Char(' '));
1002 
1003     while (it != words->end()) {
1004         if ((*it)->text() == str) {
1005             it = words->erase(it);
1006         } else {
1007             ++it;
1008         }
1009     }
1010 }
1011 
1012 /**
1013  * We will read the TinyTextEntity from characters and try to create words from there.
1014  * Note: characters might be already characters for some generators, but we will keep
1015  * the nomenclature characters for the generator produced data. The resulting
1016  * WordsWithCharacters memory has to be managed by the caller, both the
1017  * WordWithCharacters::word and WordWithCharacters::characters contents
1018  */
1019 static WordsWithCharacters makeWordFromCharacters(const TextList &characters, int pageWidth, int pageHeight)
1020 {
1021     /**
1022      * We will traverse characters and try to create words from the TinyTextEntities in it.
1023      * We will search TinyTextEntity blocks and merge them until we get a
1024      * space between two consecutive TinyTextEntities. When we get a space
1025      * we can take it as a end of word. Then we store the word as a TinyTextEntity
1026      * and keep it in newList.
1027 
1028      * We create a RegionText named regionWord that contains the word and the characters associated with it and
1029      * a rectangle area of the element in newList.
1030 
1031      */
1032     WordsWithCharacters wordsWithCharacters;
1033 
1034     TextList::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt;
1035     int newLeft, newRight, newTop, newBottom;
1036 
1037     for (; it != itEnd; it++) {
1038         QString textString = (*it)->text();
1039         QString newString;
1040         QRect lineArea = (*it)->area.roundedGeometry(pageWidth, pageHeight), elementArea;
1041         TextList wordCharacters;
1042         tmpIt = it;
1043         int space = 0;
1044 
1045         while (!space) {
1046             if (!textString.isEmpty()) {
1047                 newString.append(textString);
1048 
1049                 // when textString is the start of the word
1050                 if (tmpIt == it) {
1051                     NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1052                     wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1053                 } else {
1054                     NormalizedRect newRect(elementArea, pageWidth, pageHeight);
1055                     wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1056                 }
1057             }
1058 
1059             ++it;
1060 
1061             /*
1062              we must have to put this line before the if condition of it==itEnd
1063              otherwise the last character can be missed
1064              */
1065             if (it == itEnd) {
1066                 break;
1067             }
1068             elementArea = (*it)->area.roundedGeometry(pageWidth, pageHeight);
1069             if (!doesConsumeY(elementArea, lineArea, 60)) {
1070                 --it;
1071                 break;
1072             }
1073 
1074             const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width();
1075             const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width();
1076 
1077             space = elementArea.left() - lineArea.right();
1078 
1079             if (space != 0) {
1080                 it--;
1081                 break;
1082             }
1083 
1084             newLeft = text_x1 < line_x1 ? text_x1 : line_x1;
1085             newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1086             newTop = text_y1 > line_y1 ? line_y1 : text_y1;
1087             newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1088 
1089             lineArea.setLeft(newLeft);
1090             lineArea.setTop(newTop);
1091             lineArea.setWidth(newRight - newLeft);
1092             lineArea.setHeight(newBottom - newTop);
1093 
1094             textString = (*it)->text();
1095         }
1096 
1097         // if newString is not empty, save it
1098         if (!newString.isEmpty()) {
1099             const NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1100             TinyTextEntity *word = new TinyTextEntity(newString.normalized(QString::NormalizationForm_KC), newRect);
1101             wordsWithCharacters.append(WordWithCharacters(word, wordCharacters));
1102         }
1103 
1104         if (it == itEnd) {
1105             break;
1106         }
1107     }
1108 
1109     return wordsWithCharacters;
1110 }
1111 
1112 /**
1113  * Create Lines from the words and sort them
1114  */
1115 QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
1116 {
1117     /**
1118      * We cannot assume that the generator will give us texts in the right order.
1119      * We can only assume that we will get texts in the page and their bounding
1120      * rectangle. The texts can be character, word, half-word anything.
1121      * So, we need to:
1122      **
1123      * 1. Sort rectangles/boxes containing texts by y0(top)
1124      * 2. Create textline where there is y overlap between TinyTextEntity 's
1125      * 3. Within each line sort the TinyTextEntity 's by x0(left)
1126      */
1127 
1128     QList<QPair<WordsWithCharacters, QRect>> lines;
1129 
1130     /*
1131      Make a new copy of the TextList in the words, so that the wordsTmp and lines do
1132      not contain same pointers for all the TinyTextEntity.
1133      */
1134     QList<WordWithCharacters> words = wordsTmp;
1135 
1136     // Step 1
1137     std::sort(words.begin(), words.end(), compareTinyTextEntityY);
1138 
1139     // Step 2
1140     QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end();
1141 
1142     // for every non-space texts(characters/words) in the textList
1143     for (; it != itEnd; it++) {
1144         const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight);
1145         bool found = false;
1146 
1147         for (QPair<WordsWithCharacters, QRect> &linesI : lines) {
1148             /* the line area which will be expanded
1149                line_rects is only necessary to preserve the topmin and bottommax of all
1150                the texts in the line, left and right is not necessary at all
1151             */
1152             QRect &lineArea = linesI.second;
1153             const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width();
1154             const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width();
1155 
1156             /*
1157                if the new text and the line has y overlapping parts of more than 70%,
1158                the text will be added to this line
1159              */
1160             if (doesConsumeY(elementArea, lineArea, 70)) {
1161                 WordsWithCharacters &line = linesI.first;
1162                 line.append(*it);
1163 
1164                 const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1;
1165                 const int newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1166                 const int newTop = line_y1 < text_y1 ? line_y1 : text_y1;
1167                 const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1168 
1169                 lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop);
1170                 found = true;
1171             }
1172 
1173             if (found) {
1174                 break;
1175             }
1176         }
1177 
1178         /* when we have found a new line create a new TextList containing
1179            only one element and append it to the lines
1180          */
1181         if (!found) {
1182             WordsWithCharacters tmp;
1183             tmp.append((*it));
1184             lines.append(QPair<WordsWithCharacters, QRect>(tmp, elementArea));
1185         }
1186     }
1187 
1188     // Step 3
1189     for (QPair<WordsWithCharacters, QRect> &line : lines) {
1190         WordsWithCharacters &list = line.first;
1191         std::sort(list.begin(), list.end(), compareTinyTextEntityX);
1192     }
1193 
1194     return lines;
1195 }
1196 
1197 /**
1198  * Calculate Statistical information from the lines we made previously
1199  */
1200 static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)
1201 {
1202     /**
1203      * For the region, defined by line_rects and lines
1204      * 1. Make line statistical analysis to find the line spacing
1205      * 2. Make character statistical analysis to differentiate between
1206      *   word spacing and column spacing.
1207      */
1208 
1209     /**
1210      * Step 0
1211      */
1212     const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight);
1213 
1214     /**
1215      * Step 1
1216      */
1217     QMap<int, int> line_space_stat;
1218     for (int i = 0; i < sortedLines.length(); i++) {
1219         const QRect rectUpper = sortedLines.at(i).second;
1220 
1221         if (i + 1 == sortedLines.length()) {
1222             break;
1223         }
1224         const QRect rectLower = sortedLines.at(i + 1).second;
1225 
1226         int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height());
1227         if (linespace < 0) {
1228             linespace = -linespace;
1229         }
1230 
1231         if (line_space_stat.contains(linespace)) {
1232             line_space_stat[linespace]++;
1233         } else {
1234             line_space_stat[linespace] = 1;
1235         }
1236     }
1237 
1238     *line_spacing = 0;
1239     int weighted_count = 0;
1240     QMapIterator<int, int> iterate_linespace(line_space_stat);
1241 
1242     while (iterate_linespace.hasNext()) {
1243         iterate_linespace.next();
1244         *line_spacing += iterate_linespace.value() * iterate_linespace.key();
1245         weighted_count += iterate_linespace.value();
1246     }
1247     if (*line_spacing != 0) {
1248         *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5);
1249     }
1250 
1251     /**
1252      * Step 2
1253      */
1254     // We would like to use QMap instead of QHash as it will keep the keys sorted
1255     QMap<int, int> hor_space_stat;
1256     QMap<int, int> col_space_stat;
1257     QList<QList<QRect>> space_rects;
1258     QVector<QRect> max_hor_space_rects;
1259 
1260     // Space in every line
1261     for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1262         const WordsWithCharacters list = sortedLine.first;
1263         QList<QRect> line_space_rects;
1264         int maxSpace = 0, minSpace = pageWidth;
1265 
1266         // for every TinyTextEntity element in the line
1267         WordsWithCharacters::ConstIterator it = list.begin(), itEnd = list.end();
1268         QRect max_area1, max_area2;
1269         // for every line
1270         for (; it != itEnd; it++) {
1271             const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight);
1272             if (it + 1 == itEnd) {
1273                 break;
1274             }
1275 
1276             const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight);
1277             int space = area2.left() - area1.right();
1278 
1279             if (space > maxSpace) {
1280                 max_area1 = area1;
1281                 max_area2 = area2;
1282                 maxSpace = space;
1283             }
1284 
1285             if (space < minSpace && space != 0) {
1286                 minSpace = space;
1287             }
1288 
1289             // if we found a real space, whose length is not zero and also less than the pageWidth
1290             if (space != 0 && space != pageWidth) {
1291                 // increase the count of the space amount
1292                 if (hor_space_stat.contains(space)) {
1293                     hor_space_stat[space]++;
1294                 } else {
1295                     hor_space_stat[space] = 1;
1296                 }
1297 
1298                 int left, right, top, bottom;
1299 
1300                 left = area1.right();
1301                 right = area2.left();
1302 
1303                 top = area2.top() < area1.top() ? area2.top() : area1.top();
1304                 bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1305 
1306                 QRect rect(left, top, right - left, bottom - top);
1307                 line_space_rects.append(rect);
1308             }
1309         }
1310 
1311         space_rects.append(line_space_rects);
1312 
1313         if (hor_space_stat.contains(maxSpace)) {
1314             if (hor_space_stat[maxSpace] != 1) {
1315                 hor_space_stat[maxSpace]--;
1316             } else {
1317                 hor_space_stat.remove(maxSpace);
1318             }
1319         }
1320 
1321         if (maxSpace != 0) {
1322             if (col_space_stat.contains(maxSpace)) {
1323                 col_space_stat[maxSpace]++;
1324             } else {
1325                 col_space_stat[maxSpace] = 1;
1326             }
1327 
1328             // store the max rect of each line
1329             const int left = max_area1.right();
1330             const int right = max_area2.left();
1331             const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top();
1332             const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom();
1333 
1334             const QRect rect(left, top, right - left, bottom - top);
1335             max_hor_space_rects.append(rect);
1336         } else {
1337             max_hor_space_rects.append(QRect(0, 0, 0, 0));
1338         }
1339     }
1340 
1341     // All the between word space counts are in hor_space_stat
1342     *word_spacing = 0;
1343     weighted_count = 0;
1344     QMapIterator<int, int> iterate(hor_space_stat);
1345 
1346     while (iterate.hasNext()) {
1347         iterate.next();
1348 
1349         if (iterate.key() > 0) {
1350             *word_spacing += iterate.value() * iterate.key();
1351             weighted_count += iterate.value();
1352         }
1353     }
1354     if (weighted_count) {
1355         *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5);
1356     }
1357 
1358     *col_spacing = 0;
1359     QMapIterator<int, int> iterate_col(col_space_stat);
1360 
1361     while (iterate_col.hasNext()) {
1362         iterate_col.next();
1363         if (iterate_col.value() > *col_spacing) {
1364             *col_spacing = iterate_col.value();
1365         }
1366     }
1367     *col_spacing = col_space_stat.key(*col_spacing);
1368 
1369     // if there is just one line in a region, there is no point in dividing it
1370     if (sortedLines.length() == 1) {
1371         *word_spacing = *col_spacing;
1372     }
1373 }
1374 
1375 /**
1376  * Implements the XY Cut algorithm for textpage segmentation
1377  * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and
1378  * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function)
1379  */
1380 static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight)
1381 {
1382     RegionTextList tree;
1383     QRect contentRect(0, 0, pageWidth, pageHeight);
1384     const RegionText root(wordsWithCharacters, contentRect);
1385 
1386     // start the tree with the root, it is our only region at the start
1387     tree.push_back(root);
1388 
1389     int i = 0;
1390 
1391     // while traversing the tree has not been ended
1392     while (i < tree.length()) {
1393         const RegionText node = tree.at(i);
1394         QRect regionRect = node.area();
1395 
1396         /**
1397          * 1. calculation of projection profiles
1398          */
1399         // allocate the size of proj profiles and initialize with 0
1400         int size_proj_y = node.area().height();
1401         int size_proj_x = node.area().width();
1402         // dynamic memory allocation
1403         QVarLengthArray<int> proj_on_xaxis(size_proj_x);
1404         QVarLengthArray<int> proj_on_yaxis(size_proj_y);
1405 
1406         for (int j = 0; j < size_proj_y; ++j) {
1407             proj_on_yaxis[j] = 0;
1408         }
1409         for (int j = 0; j < size_proj_x; ++j) {
1410             proj_on_xaxis[j] = 0;
1411         }
1412 
1413         const QList<WordWithCharacters> list = node.text();
1414 
1415         // Calculate tcx and tcy locally for each new region
1416         int word_spacing, line_spacing, column_spacing;
1417         calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing);
1418 
1419         const int tcx = word_spacing * 2;
1420         const int tcy = line_spacing * 2;
1421 
1422         int maxX = 0, maxY = 0;
1423         int avgX = 0;
1424         int count;
1425 
1426         // for every text in the region
1427         for (const WordWithCharacters &wwc : list) {
1428             TinyTextEntity *ent = wwc.word;
1429             const QRect entRect = ent->area.geometry(pageWidth, pageHeight);
1430 
1431             // calculate vertical projection profile proj_on_xaxis1
1432             for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) {
1433                 if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0) {
1434                     proj_on_xaxis[k - regionRect.left()] += entRect.height();
1435                 }
1436             }
1437 
1438             // calculate horizontal projection profile in the same way
1439             for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) {
1440                 if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0) {
1441                     proj_on_yaxis[k - regionRect.top()] += entRect.width();
1442                 }
1443             }
1444         }
1445 
1446         for (int j = 0; j < size_proj_y; ++j) {
1447             if (proj_on_yaxis[j] > maxY) {
1448                 maxY = proj_on_yaxis[j];
1449             }
1450         }
1451 
1452         avgX = count = 0;
1453         for (int j = 0; j < size_proj_x; ++j) {
1454             if (proj_on_xaxis[j] > maxX) {
1455                 maxX = proj_on_xaxis[j];
1456             }
1457             if (proj_on_xaxis[j]) {
1458                 count++;
1459                 avgX += proj_on_xaxis[j];
1460             }
1461         }
1462         if (count) {
1463             avgX /= count;
1464         }
1465 
1466         /**
1467          * 2. Cleanup Boundary White Spaces and removal of noise
1468          */
1469         int xbegin = 0, xend = size_proj_x - 1;
1470         int ybegin = 0, yend = size_proj_y - 1;
1471         while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0) {
1472             xbegin++;
1473         }
1474         while (xend >= 0 && proj_on_xaxis[xend] <= 0) {
1475             xend--;
1476         }
1477         while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0) {
1478             ybegin++;
1479         }
1480         while (yend >= 0 && proj_on_yaxis[yend] <= 0) {
1481             yend--;
1482         }
1483 
1484         // update the regionRect
1485         int old_left = regionRect.left(), old_top = regionRect.top();
1486         regionRect.setLeft(old_left + xbegin);
1487         regionRect.setRight(old_left + xend);
1488         regionRect.setTop(old_top + ybegin);
1489         regionRect.setBottom(old_top + yend);
1490 
1491         int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0;
1492         for (int j = 0; j < size_proj_x; ++j) {
1493             proj_on_xaxis[j] -= tnx;
1494         }
1495         for (int j = 0; j < size_proj_y; ++j) {
1496             proj_on_yaxis[j] -= tny;
1497         }
1498 
1499         /**
1500          * 3. Find the Widest gap
1501          */
1502         int gap_hor = -1, pos_hor = -1;
1503         int begin = -1, end = -1;
1504 
1505         // find all hor_gaps and find the maximum between them
1506         for (int j = 1; j < size_proj_y; ++j) {
1507             // transition from white to black
1508             if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0) {
1509                 end = j;
1510             }
1511 
1512             // transition from black to white
1513             if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0) {
1514                 begin = j;
1515             }
1516 
1517             if (begin > 0 && end > 0 && end - begin > gap_hor) {
1518                 gap_hor = end - begin;
1519                 pos_hor = (end + begin) / 2;
1520                 begin = -1;
1521                 end = -1;
1522             }
1523         }
1524 
1525         begin = -1, end = -1;
1526         int gap_ver = -1, pos_ver = -1;
1527 
1528         // find all the ver_gaps and find the maximum between them
1529         for (int j = 1; j < size_proj_x; ++j) {
1530             // transition from white to black
1531             if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) {
1532                 end = j;
1533             }
1534 
1535             // transition from black to white
1536             if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0) {
1537                 begin = j;
1538             }
1539 
1540             if (begin > 0 && end > 0 && end - begin > gap_ver) {
1541                 gap_ver = end - begin;
1542                 pos_ver = (end + begin) / 2;
1543                 begin = -1;
1544                 end = -1;
1545             }
1546         }
1547 
1548         int cut_pos_x = pos_ver, cut_pos_y = pos_hor;
1549         int gap_x = gap_ver, gap_y = gap_hor;
1550 
1551         /**
1552          * 4. Cut the region and make nodes (left,right) or (up,down)
1553          */
1554         bool cut_hor = false, cut_ver = false;
1555 
1556         // For horizontal cut
1557         const int topHeight = cut_pos_y - (regionRect.top() - old_top);
1558         const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight);
1559         const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight);
1560 
1561         // For vertical Cut
1562         const int leftWidth = cut_pos_x - (regionRect.left() - old_left);
1563         const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height());
1564         const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height());
1565 
1566         if (gap_y >= gap_x && gap_y >= tcy) {
1567             cut_hor = true;
1568         } else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx) {
1569             cut_ver = true;
1570         } else if (gap_x >= gap_y && gap_x >= tcx) {
1571             cut_ver = true;
1572         } else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy) {
1573             cut_hor = true;
1574         } else {
1575             // no cut possible
1576             // we can now update the node rectangle with the shrinked rectangle
1577             RegionText tmpNode = tree.at(i);
1578             tmpNode.setArea(regionRect);
1579             tree.replace(i, tmpNode);
1580             i++;
1581             continue;
1582         }
1583 
1584         WordsWithCharacters list1, list2;
1585 
1586         // horizontal cut, topRect and bottomRect
1587         if (cut_hor) {
1588             for (const WordWithCharacters &word : list) {
1589                 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1590 
1591                 if (topRect.intersects(wordRect)) {
1592                     list1.append(word);
1593                 } else {
1594                     list2.append(word);
1595                 }
1596             }
1597 
1598             RegionText node1(list1, topRect);
1599             RegionText node2(list2, bottomRect);
1600 
1601             tree.replace(i, node1);
1602             tree.insert(i + 1, node2);
1603         }
1604 
1605         // vertical cut, leftRect and rightRect
1606         else if (cut_ver) {
1607             for (const WordWithCharacters &word : list) {
1608                 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1609 
1610                 if (leftRect.intersects(wordRect)) {
1611                     list1.append(word);
1612                 } else {
1613                     list2.append(word);
1614                 }
1615             }
1616 
1617             RegionText node1(list1, leftRect);
1618             RegionText node2(list2, rightRect);
1619 
1620             tree.replace(i, node1);
1621             tree.insert(i + 1, node2);
1622         }
1623     }
1624 
1625     return tree;
1626 }
1627 
1628 /**
1629  * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed
1630  */
1631 WordsWithCharacters addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight)
1632 {
1633     /**
1634      * 1. Call makeAndSortLines before adding spaces in between words in a line
1635      * 2. Now add spaces between every two words in a line
1636      * 3. Finally, extract all the space separated texts from each region and return it
1637      */
1638 
1639     // Only change the texts under RegionTexts, not the area
1640     for (RegionText &tmpRegion : tree) {
1641         // Step 01
1642         QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight);
1643 
1644         // Step 02
1645         for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1646             WordsWithCharacters &list = sortedLine.first;
1647             for (int k = 0; k < list.length(); k++) {
1648                 const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight);
1649                 if (k + 1 >= list.length()) {
1650                     break;
1651                 }
1652 
1653                 const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight);
1654                 const int space = area2.left() - area1.right();
1655 
1656                 if (space != 0) {
1657                     // Make a TinyTextEntity of string space and push it between it and it+1
1658                     const int left = area1.right();
1659                     const int right = area2.left();
1660                     const int top = area2.top() < area1.top() ? area2.top() : area1.top();
1661                     const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1662 
1663                     const QString spaceStr(QStringLiteral(" "));
1664                     const QRect rect(QPoint(left, top), QPoint(right, bottom));
1665                     const NormalizedRect entRect(rect, pageWidth, pageHeight);
1666                     TinyTextEntity *ent1 = new TinyTextEntity(spaceStr, entRect);
1667                     TinyTextEntity *ent2 = new TinyTextEntity(spaceStr, entRect);
1668                     WordWithCharacters word(ent1, QList<TinyTextEntity *>() << ent2);
1669 
1670                     list.insert(k + 1, word);
1671 
1672                     // Skip the space
1673                     k++;
1674                 }
1675             }
1676         }
1677 
1678         WordsWithCharacters tmpList;
1679         for (const QPair<WordsWithCharacters, QRect> &sortedLine : std::as_const(sortedLines)) {
1680             tmpList += sortedLine.first;
1681         }
1682         tmpRegion.setText(tmpList);
1683     }
1684 
1685     // Step 03
1686     WordsWithCharacters tmp;
1687     for (const RegionText &tmpRegion : std::as_const(tree)) {
1688         tmp += tmpRegion.text();
1689     }
1690     return tmp;
1691 }
1692 
1693 /**
1694  * Correct the textOrder, all layout recognition works here
1695  */
1696 void TextPagePrivate::correctTextOrder()
1697 {
1698     // m_page->width() and m_page->height() are in pixels at
1699     // 100% zoom level, and thus depend on display DPI.
1700     // To avoid Okular failing on lowDPI displays,
1701     // we scale pageWidth and pageHeight so their sum equals 2000.
1702     const double scalingFactor = 2000.0 / (m_page->width() + m_page->height());
1703     const int pageWidth = (int)(scalingFactor * m_page->width());
1704     const int pageHeight = (int)(scalingFactor * m_page->height());
1705 
1706     TextList characters = m_words;
1707 
1708     /**
1709      * Remove spaces from the text
1710      */
1711     removeSpace(&characters);
1712 
1713     /**
1714      * Construct words from characters
1715      */
1716     const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight);
1717 
1718     /**
1719      * Make a XY Cut tree for segmentation of the texts
1720      */
1721     const RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight);
1722 
1723     /**
1724      * Add spaces to the word
1725      */
1726     const WordsWithCharacters listWithWordsAndSpaces = addNecessarySpace(tree, pageWidth, pageHeight);
1727 
1728     /**
1729      * Break the words into characters
1730      */
1731     TextList listOfCharacters;
1732     for (const WordWithCharacters &word : listWithWordsAndSpaces) {
1733         delete word.word;
1734         listOfCharacters.append(word.characters);
1735     }
1736     setWordList(listOfCharacters);
1737 }
1738 
1739 TextEntity::List TextPage::words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
1740 {
1741     if (area && area->isNull()) {
1742         return TextEntity::List();
1743     }
1744 
1745     TextEntity::List ret;
1746     if (area) {
1747         for (const TinyTextEntity *te : std::as_const(d->m_words)) {
1748             if (b == AnyPixelTextAreaInclusionBehaviour) {
1749                 if (area->intersects(te->area)) {
1750                     ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1751                 }
1752             } else {
1753                 const NormalizedPoint center = te->area.center();
1754                 if (area->contains(center.x, center.y)) {
1755                     ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1756                 }
1757             }
1758         }
1759     } else {
1760         for (const TinyTextEntity *te : std::as_const(d->m_words)) {
1761             ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1762         }
1763     }
1764     return ret;
1765 }
1766 
1767 RegularAreaRect *TextPage::wordAt(const NormalizedPoint &p, QString *word) const
1768 {
1769     TextList::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1770     TextList::ConstIterator it = itBegin;
1771     TextList::ConstIterator posIt = itEnd;
1772     for (; it != itEnd; ++it) {
1773         if ((*it)->area.contains(p.x, p.y)) {
1774             posIt = it;
1775             break;
1776         }
1777     }
1778     if (posIt != itEnd) {
1779         if ((*posIt)->text().simplified().isEmpty()) {
1780             return nullptr;
1781         }
1782         // Find the first TinyTextEntity of the word
1783         while (posIt != itBegin) {
1784             --posIt;
1785             const QString itText = (*posIt)->text();
1786             if (itText.right(1).at(0).isSpace()) {
1787                 if (itText.endsWith(QLatin1String("-\n"))) {
1788                     // Is an hyphenated word
1789                     // continue searching the start of the word back
1790                     continue;
1791                 }
1792 
1793                 if (itText == QLatin1String("\n") && posIt != itBegin) {
1794                     --posIt;
1795                     if ((*posIt)->text().endsWith(QLatin1String("-"))) {
1796                         // Is an hyphenated word
1797                         // continue searching the start of the word back
1798                         continue;
1799                     }
1800                     ++posIt;
1801                 }
1802 
1803                 ++posIt;
1804                 break;
1805             }
1806         }
1807         RegularAreaRect *ret = new RegularAreaRect();
1808         QString foundWord;
1809         for (; posIt != itEnd; ++posIt) {
1810             const QString itText = (*posIt)->text();
1811             if (itText.simplified().isEmpty()) {
1812                 break;
1813             }
1814 
1815             ret->appendShape((*posIt)->area);
1816             foundWord += (*posIt)->text();
1817             if (itText.right(1).at(0).isSpace()) {
1818                 if (!foundWord.endsWith(QLatin1String("-\n"))) {
1819                     break;
1820                 }
1821             }
1822         }
1823 
1824         if (word) {
1825             *word = foundWord;
1826         }
1827         return ret;
1828     } else {
1829         return nullptr;
1830     }
1831 }