okteta/core/textbytearrayanalyzer.cpp

0001 /*
0002     This file is part of the Okteta Core library, made within the KDE community.
0003
0004     SPDX-FileCopyrightText: 2005, 2008-2009 Friedrich W. H. Kossebau <kossebau@kde.org>
0005
0006     SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL
0007 */
0008
0009 #include "textbytearrayanalyzer.hpp"
0010
0011 // lib
0012 #include "abstractbytearraymodel.hpp"
0013 #include "character.hpp"
0014 #include "charcodec.hpp"
0015
0016 namespace Okteta {
0017
0018 class TextByteArrayAnalyzerPrivate
0019 {
0020 public:
0021     const AbstractByteArrayModel* const byteArrayModel;
0022     const CharCodec* const charCodec;
0023
0024 public:
0025     TextByteArrayAnalyzerPrivate(const AbstractByteArrayModel* byteArrayModel, const CharCodec* charCodec);
0026 };
0027
0028 TextByteArrayAnalyzerPrivate::TextByteArrayAnalyzerPrivate(const AbstractByteArrayModel* byteArrayModel, const CharCodec* charCodec)
0029     : byteArrayModel(byteArrayModel)
0030     , charCodec(charCodec)
0031 {}
0032
0033
0034 TextByteArrayAnalyzer::TextByteArrayAnalyzer(const AbstractByteArrayModel* byteArrayModel, const CharCodec* charCodec)
0035     : d_ptr(new TextByteArrayAnalyzerPrivate(byteArrayModel, charCodec))
0036 {
0037 }
0038
0039 TextByteArrayAnalyzer::~TextByteArrayAnalyzer() = default;
0040
0041 AddressRange TextByteArrayAnalyzer::wordSection(Address index) const
0042 {
0043     return isWordChar(index) ?
0044            AddressRange(indexOfWordStart(index), indexOfWordEnd(index)) :
0045            AddressRange();
0046 }
0047
0048 bool TextByteArrayAnalyzer::isWordChar(Address index) const
0049 {
0050     Q_D(const TextByteArrayAnalyzer);
0051
0052     const Character decodedChar = d->charCodec->decode(d->byteArrayModel->byte(index));
0053     return !decodedChar.isUndefined() && decodedChar.isLetterOrNumber();
0054 }
0055
0056 Address TextByteArrayAnalyzer::indexOfPreviousWordStart(Address index) const
0057 {
0058     Q_D(const TextByteArrayAnalyzer);
0059
0060     const Size size = d->byteArrayModel->size();
0061     // already at the start or can the result only be 0?
0062     if (index == 0 || size < 3) {
0063         return 0;
0064     }
0065
0066     // search in two rounds: first for the next char, than for the next nonchar
0067     // after that return the index of the one before
0068     bool lookingForFirstWordChar = false;
0069     for (; index > 0; --index) {
0070         if (!isWordChar(index - 1)) {
0071             if (!lookingForFirstWordChar) {
0072                 continue;
0073             }
0074             return(index);
0075         }
0076         if (!lookingForFirstWordChar) {
0077             lookingForFirstWordChar = true;
0078         }
0079     }
0080
0081     return 0;
0082 }
0083
0084 Address TextByteArrayAnalyzer::indexOfNextWordStart(Address index) const
0085 {
0086     Q_D(const TextByteArrayAnalyzer);
0087
0088     const Size size = d->byteArrayModel->size();
0089     bool lookingForFirstWordChar = false;
0090     for (; index < size; ++index) {
0091         if (isWordChar(index)) {
0092             if (!lookingForFirstWordChar) {
0093                 continue;
0094             }
0095             return index;
0096         }
0097         if (!lookingForFirstWordChar) {
0098             lookingForFirstWordChar = true;
0099         }
0100     }
0101
0102     // if no more word found, go to the end
0103     return size;
0104 }
0105
0106 Address TextByteArrayAnalyzer::indexOfBeforeNextWordStart(Address index) const
0107 {
0108     Q_D(const TextByteArrayAnalyzer);
0109
0110     const Size size = d->byteArrayModel->size();
0111     bool lookingForFirstWordChar = false;
0112     for (; index < size; ++index) {
0113         if (isWordChar(index)) {
0114             if (!lookingForFirstWordChar) {
0115                 continue;
0116             }
0117             return index - 1;
0118         }
0119         if (!lookingForFirstWordChar) {
0120             lookingForFirstWordChar = true;
0121         }
0122     }
0123
0124     // if no more word found, go to the end
0125     return size - 1;
0126 }
0127
0128 Address TextByteArrayAnalyzer::indexOfWordStart(Address index) const
0129 {
0130     for (; index > 0; --index) {
0131         if (!isWordChar(index - 1)) {
0132             return(index);
0133         }
0134     }
0135
0136     return 0;
0137 }
0138
0139 Address TextByteArrayAnalyzer::indexOfWordEnd(Address index) const
0140 {
0141     Q_D(const TextByteArrayAnalyzer);
0142
0143     const Size size = d->byteArrayModel->size();
0144     for (++index; index < size; ++index) {
0145         if (!isWordChar(index)) {
0146             return index - 1;
0147         }
0148     }
0149
0150     // word reaches the end
0151     return size - 1;
0152 }
0153
0154 Address TextByteArrayAnalyzer::indexOfLeftWordSelect(Address index) const
0155 {
0156     Q_D(const TextByteArrayAnalyzer);
0157
0158     // word at index?
0159     if (isWordChar(index)) {
0160         // search for word start to the left
0161         for (; index > 0; --index) {
0162             if (!isWordChar(index - 1)) {
0163                 return index;
0164             }
0165         }
0166
0167         // reached start, so return it
0168         return 0;
0169     }
0170
0171     const Size size = d->byteArrayModel->size();
0172     // search for word start to the right
0173     for (++index; index < size; ++index) {
0174         if (isWordChar(index)) {
0175             return index;
0176         }
0177     }
0178
0179     // word reaches the end, so step behind
0180     return size;
0181 }
0182
0183 Address TextByteArrayAnalyzer::indexOfRightWordSelect(Address index) const
0184 {
0185     Q_D(const TextByteArrayAnalyzer);
0186
0187     // TODO: should this check be here or with the caller?
0188     // the later would need another function to search the previous word end
0189     const Size size = d->byteArrayModel->size();
0190     bool searchToLeft;
0191     if (index >= size) {
0192         index = size;
0193         searchToLeft = true;
0194     } else {
0195         searchToLeft = !isWordChar(index);
0196     }
0197     // no word at index?
0198     if (searchToLeft) {
0199         // search for word end to the left
0200         for (; index > 0; --index) {
0201             if (isWordChar(index - 1)) {
0202                 return index;
0203             }
0204         }
0205
0206         // reached start, so return it
0207         return 0;
0208     }
0209
0210     for (++index; index < size; ++index) {
0211         // search for word end to the right
0212         if (!isWordChar(index)) {
0213             return index;
0214         }
0215     }
0216
0217     // word reaches the end, so step behind
0218     return size;
0219 }
0220
0221 /*
0222 Address TextByteArrayAnalyzer::indexOfBehindWordEnd( Address index ) const
0223 {
0224    // no word at index?
0225    return !::isWordChar(byte(index)) ? indexOfBehindLeftWordEnd(index) : indexOfBehindRightWordEnd(index+1)
0226 }
0227
0228
0229 Address TextByteArrayAnalyzer::indexOfBehindRightWordEnd( Address index ) const
0230 {
0231    for( ; index<size(); ++index )
0232    {
0233         if( !::isWordChar(byte(index)) )
0234             return index;
0235    }
0236    // word reaches the end, so step behind
0237    return size();
0238 }
0239
0240
0241 Address TextByteArrayAnalyzer::indexOfBehindLeftWordEnd( Address index ) const
0242 {
0243    for( --index; index>=0; --index )
0244    {
0245         if( ::isWordChar(byte(index)) )
0246             return index+1;
0247    }
0248    // word reaches the end, so step behind
0249    return 0;
0250 }
0251 */
0252
0253 QString TextByteArrayAnalyzer::text(Address index, Address lastIndex) const
0254 {
0255     Q_D(const TextByteArrayAnalyzer);
0256
0257     QString result;
0258
0259     const Address lastValidIndex = d->byteArrayModel->size() - 1;
0260     const Address behindLastIndex =
0261         ((lastIndex <0 || lastIndex> lastValidIndex) ? lastValidIndex : lastIndex) + 1;
0262
0263     const Size maxTextLength = behindLastIndex - index;
0264     result.reserve(maxTextLength);
0265
0266     for (; index < behindLastIndex; ++index) {
0267         const Character decodedChar = d->charCodec->decode(d->byteArrayModel->byte(index));
0268         // TODO: handle line breaks, separators and spacing, controlled by flags given as parameter
0269         const bool isTextChar = (!decodedChar.isUndefined() &&
0270                                  (decodedChar.isLetterOrNumber() || decodedChar.isSpace() || decodedChar.isPunct()));
0271
0272         if (!isTextChar) {
0273             break;
0274         }
0275
0276         result.append(decodedChar);
0277     }
0278
0279     result.squeeze();
0280
0281     return result;
0282 }
0283
0284 }