File indexing completed on 2024-04-14 05:45:47
0001 /* 0002 This file is part of the Okteta Core library, made within the KDE community. 0003 0004 SPDX-FileCopyrightText: 2005, 2008-2009 Friedrich W. H. Kossebau <kossebau@kde.org> 0005 0006 SPDX-License-Identifier: LGPL-2.1-only OR LGPL-3.0-only OR LicenseRef-KDE-Accepted-LGPL 0007 */ 0008 0009 #include "textbytearrayanalyzer.hpp" 0010 0011 // lib 0012 #include "abstractbytearraymodel.hpp" 0013 #include "character.hpp" 0014 #include "charcodec.hpp" 0015 0016 namespace Okteta { 0017 0018 class TextByteArrayAnalyzerPrivate 0019 { 0020 public: 0021 const AbstractByteArrayModel* const byteArrayModel; 0022 const CharCodec* const charCodec; 0023 0024 public: 0025 TextByteArrayAnalyzerPrivate(const AbstractByteArrayModel* byteArrayModel, const CharCodec* charCodec); 0026 }; 0027 0028 TextByteArrayAnalyzerPrivate::TextByteArrayAnalyzerPrivate(const AbstractByteArrayModel* byteArrayModel, const CharCodec* charCodec) 0029 : byteArrayModel(byteArrayModel) 0030 , charCodec(charCodec) 0031 {} 0032 0033 0034 TextByteArrayAnalyzer::TextByteArrayAnalyzer(const AbstractByteArrayModel* byteArrayModel, const CharCodec* charCodec) 0035 : d_ptr(new TextByteArrayAnalyzerPrivate(byteArrayModel, charCodec)) 0036 { 0037 } 0038 0039 TextByteArrayAnalyzer::~TextByteArrayAnalyzer() = default; 0040 0041 AddressRange TextByteArrayAnalyzer::wordSection(Address index) const 0042 { 0043 return isWordChar(index) ? 0044 AddressRange(indexOfWordStart(index), indexOfWordEnd(index)) : 0045 AddressRange(); 0046 } 0047 0048 bool TextByteArrayAnalyzer::isWordChar(Address index) const 0049 { 0050 Q_D(const TextByteArrayAnalyzer); 0051 0052 const Character decodedChar = d->charCodec->decode(d->byteArrayModel->byte(index)); 0053 return !decodedChar.isUndefined() && decodedChar.isLetterOrNumber(); 0054 } 0055 0056 Address TextByteArrayAnalyzer::indexOfPreviousWordStart(Address index) const 0057 { 0058 Q_D(const TextByteArrayAnalyzer); 0059 0060 const Size size = d->byteArrayModel->size(); 0061 // already at the start or can the result only be 0? 0062 if (index == 0 || size < 3) { 0063 return 0; 0064 } 0065 0066 // search in two rounds: first for the next char, than for the next nonchar 0067 // after that return the index of the one before 0068 bool lookingForFirstWordChar = false; 0069 for (; index > 0; --index) { 0070 if (!isWordChar(index - 1)) { 0071 if (!lookingForFirstWordChar) { 0072 continue; 0073 } 0074 return(index); 0075 } 0076 if (!lookingForFirstWordChar) { 0077 lookingForFirstWordChar = true; 0078 } 0079 } 0080 0081 return 0; 0082 } 0083 0084 Address TextByteArrayAnalyzer::indexOfNextWordStart(Address index) const 0085 { 0086 Q_D(const TextByteArrayAnalyzer); 0087 0088 const Size size = d->byteArrayModel->size(); 0089 bool lookingForFirstWordChar = false; 0090 for (; index < size; ++index) { 0091 if (isWordChar(index)) { 0092 if (!lookingForFirstWordChar) { 0093 continue; 0094 } 0095 return index; 0096 } 0097 if (!lookingForFirstWordChar) { 0098 lookingForFirstWordChar = true; 0099 } 0100 } 0101 0102 // if no more word found, go to the end 0103 return size; 0104 } 0105 0106 Address TextByteArrayAnalyzer::indexOfBeforeNextWordStart(Address index) const 0107 { 0108 Q_D(const TextByteArrayAnalyzer); 0109 0110 const Size size = d->byteArrayModel->size(); 0111 bool lookingForFirstWordChar = false; 0112 for (; index < size; ++index) { 0113 if (isWordChar(index)) { 0114 if (!lookingForFirstWordChar) { 0115 continue; 0116 } 0117 return index - 1; 0118 } 0119 if (!lookingForFirstWordChar) { 0120 lookingForFirstWordChar = true; 0121 } 0122 } 0123 0124 // if no more word found, go to the end 0125 return size - 1; 0126 } 0127 0128 Address TextByteArrayAnalyzer::indexOfWordStart(Address index) const 0129 { 0130 for (; index > 0; --index) { 0131 if (!isWordChar(index - 1)) { 0132 return(index); 0133 } 0134 } 0135 0136 return 0; 0137 } 0138 0139 Address TextByteArrayAnalyzer::indexOfWordEnd(Address index) const 0140 { 0141 Q_D(const TextByteArrayAnalyzer); 0142 0143 const Size size = d->byteArrayModel->size(); 0144 for (++index; index < size; ++index) { 0145 if (!isWordChar(index)) { 0146 return index - 1; 0147 } 0148 } 0149 0150 // word reaches the end 0151 return size - 1; 0152 } 0153 0154 Address TextByteArrayAnalyzer::indexOfLeftWordSelect(Address index) const 0155 { 0156 Q_D(const TextByteArrayAnalyzer); 0157 0158 // word at index? 0159 if (isWordChar(index)) { 0160 // search for word start to the left 0161 for (; index > 0; --index) { 0162 if (!isWordChar(index - 1)) { 0163 return index; 0164 } 0165 } 0166 0167 // reached start, so return it 0168 return 0; 0169 } 0170 0171 const Size size = d->byteArrayModel->size(); 0172 // search for word start to the right 0173 for (++index; index < size; ++index) { 0174 if (isWordChar(index)) { 0175 return index; 0176 } 0177 } 0178 0179 // word reaches the end, so step behind 0180 return size; 0181 } 0182 0183 Address TextByteArrayAnalyzer::indexOfRightWordSelect(Address index) const 0184 { 0185 Q_D(const TextByteArrayAnalyzer); 0186 0187 // TODO: should this check be here or with the caller? 0188 // the later would need another function to search the previous word end 0189 const Size size = d->byteArrayModel->size(); 0190 bool searchToLeft; 0191 if (index >= size) { 0192 index = size; 0193 searchToLeft = true; 0194 } else { 0195 searchToLeft = !isWordChar(index); 0196 } 0197 // no word at index? 0198 if (searchToLeft) { 0199 // search for word end to the left 0200 for (; index > 0; --index) { 0201 if (isWordChar(index - 1)) { 0202 return index; 0203 } 0204 } 0205 0206 // reached start, so return it 0207 return 0; 0208 } 0209 0210 for (++index; index < size; ++index) { 0211 // search for word end to the right 0212 if (!isWordChar(index)) { 0213 return index; 0214 } 0215 } 0216 0217 // word reaches the end, so step behind 0218 return size; 0219 } 0220 0221 /* 0222 Address TextByteArrayAnalyzer::indexOfBehindWordEnd( Address index ) const 0223 { 0224 // no word at index? 0225 return !::isWordChar(byte(index)) ? indexOfBehindLeftWordEnd(index) : indexOfBehindRightWordEnd(index+1) 0226 } 0227 0228 0229 Address TextByteArrayAnalyzer::indexOfBehindRightWordEnd( Address index ) const 0230 { 0231 for( ; index<size(); ++index ) 0232 { 0233 if( !::isWordChar(byte(index)) ) 0234 return index; 0235 } 0236 // word reaches the end, so step behind 0237 return size(); 0238 } 0239 0240 0241 Address TextByteArrayAnalyzer::indexOfBehindLeftWordEnd( Address index ) const 0242 { 0243 for( --index; index>=0; --index ) 0244 { 0245 if( ::isWordChar(byte(index)) ) 0246 return index+1; 0247 } 0248 // word reaches the end, so step behind 0249 return 0; 0250 } 0251 */ 0252 0253 QString TextByteArrayAnalyzer::text(Address index, Address lastIndex) const 0254 { 0255 Q_D(const TextByteArrayAnalyzer); 0256 0257 QString result; 0258 0259 const Address lastValidIndex = d->byteArrayModel->size() - 1; 0260 const Address behindLastIndex = 0261 ((lastIndex <0 || lastIndex> lastValidIndex) ? lastValidIndex : lastIndex) + 1; 0262 0263 const Size maxTextLength = behindLastIndex - index; 0264 result.reserve(maxTextLength); 0265 0266 for (; index < behindLastIndex; ++index) { 0267 const Character decodedChar = d->charCodec->decode(d->byteArrayModel->byte(index)); 0268 // TODO: handle line breaks, separators and spacing, controlled by flags given as parameter 0269 const bool isTextChar = (!decodedChar.isUndefined() && 0270 (decodedChar.isLetterOrNumber() || decodedChar.isSpace() || decodedChar.isPunct())); 0271 0272 if (!isTextChar) { 0273 break; 0274 } 0275 0276 result.append(decodedChar); 0277 } 0278 0279 result.squeeze(); 0280 0281 return result; 0282 } 0283 0284 }