File indexing completed on 2024-04-28 07:46:12
0001 /* 0002 SPDX-FileCopyrightText: 2010 Christoph Cullmann <cullmann@kde.org> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #ifndef KATE_TEXTLOADER_H 0008 #define KATE_TEXTLOADER_H 0009 0010 #include <QCryptographicHash> 0011 #include <QFile> 0012 #include <QMimeDatabase> 0013 #include <QString> 0014 #include <QStringDecoder> 0015 0016 #include <KCompressionDevice> 0017 #include <KEncodingProber> 0018 0019 #include "katetextbuffer.h" 0020 0021 namespace Kate 0022 { 0023 /** 0024 * loader block size, load 256 kb at once per default 0025 * if file size is smaller, fall back to file size 0026 * must be a multiple of 2 0027 */ 0028 static const qint64 KATE_FILE_LOADER_BS = 256 * 1024; 0029 0030 /** 0031 * File Loader, will handle reading of files + detecting encoding 0032 */ 0033 class TextLoader 0034 { 0035 public: 0036 /** 0037 * Construct file loader for given file. 0038 * @param filename file to open 0039 * @param proberType prober type 0040 */ 0041 TextLoader(const QString &filename, KEncodingProber::ProberType proberType) 0042 : m_eof(false) // default to not eof 0043 , m_lastWasEndOfLine(true) // at start of file, we had a virtual newline 0044 , m_lastWasR(false) // we have not found a \r as last char 0045 , m_position(0) 0046 , m_lastLineStart(0) 0047 , m_eol(TextBuffer::eolUnknown) // no eol type detected atm 0048 , m_buffer(KATE_FILE_LOADER_BS, 0) 0049 , m_digest(QCryptographicHash::Sha1) 0050 , m_bomFound(false) 0051 , m_firstRead(true) 0052 , m_proberType(proberType) 0053 , m_fileSize(0) 0054 { 0055 // try to get mimetype for on the fly decompression, don't rely on filename! 0056 QFile testMime(filename); 0057 if (testMime.open(QIODevice::ReadOnly)) { 0058 m_fileSize = testMime.size(); 0059 } 0060 m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(filename, &testMime).name(); 0061 0062 // construct filter device 0063 KCompressionDevice::CompressionType compressionType = KCompressionDevice::compressionTypeForMimeType(m_mimeType); 0064 m_file = new KCompressionDevice(filename, compressionType); 0065 } 0066 0067 /** 0068 * Destructor 0069 */ 0070 ~TextLoader() 0071 { 0072 delete m_file; 0073 } 0074 0075 /** 0076 * open file with given codec 0077 * @param codec codec to use, if 0, will do some auto-detect or fallback 0078 * @return success 0079 */ 0080 bool open(const QString &codec) 0081 { 0082 m_codec = codec; 0083 m_eof = false; 0084 m_lastWasEndOfLine = true; 0085 m_lastWasR = false; 0086 m_position = 0; 0087 m_lastLineStart = 0; 0088 m_eol = TextBuffer::eolUnknown; 0089 m_text.clear(); 0090 m_converterState = m_codec.isEmpty() ? QStringDecoder() : QStringDecoder(m_codec.toUtf8().constData()); 0091 m_bomFound = false; 0092 m_firstRead = true; 0093 0094 // init the hash with the git header 0095 const QString header = QStringLiteral("blob %1").arg(m_fileSize); 0096 m_digest.reset(); 0097 m_digest.addData(QByteArray(header.toLatin1() + '\0')); 0098 0099 // if already opened, close the file... 0100 if (m_file->isOpen()) { 0101 m_file->close(); 0102 } 0103 0104 return m_file->open(QIODevice::ReadOnly); 0105 } 0106 0107 /** 0108 * end of file reached? 0109 * @return end of file reached 0110 */ 0111 bool eof() const 0112 { 0113 return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length()); 0114 } 0115 0116 /** 0117 * Detected end of line mode for this file. 0118 * Detected during reading, is valid after complete file is read. 0119 * @return eol mode of this file 0120 */ 0121 TextBuffer::EndOfLineMode eol() const 0122 { 0123 return m_eol; 0124 } 0125 0126 /** 0127 * BOM found? 0128 * @return byte order mark found? 0129 */ 0130 bool byteOrderMarkFound() const 0131 { 0132 return m_bomFound; 0133 } 0134 0135 /** 0136 * mime type used to create filter dev 0137 * @return mime-type of filter device 0138 */ 0139 const QString &mimeTypeForFilterDev() const 0140 { 0141 return m_mimeType; 0142 } 0143 0144 /** 0145 * internal Unicode data array 0146 * @return internal Unicode data 0147 */ 0148 const QChar *unicode() const 0149 { 0150 return m_text.unicode(); 0151 } 0152 0153 /** 0154 * Get codec for this loader 0155 * @return currently in use codec of this loader 0156 */ 0157 QString textCodec() const 0158 { 0159 return m_codec; 0160 } 0161 0162 /** 0163 * read a line, return length + offset in Unicode data 0164 * @param offset offset into internal Unicode data for read line 0165 * @param length length of read line 0166 * @return true if no encoding errors occurred 0167 */ 0168 bool readLine(int &offset, int &length) 0169 { 0170 length = 0; 0171 offset = 0; 0172 bool encodingError = false; 0173 0174 static const QLatin1Char cr(QLatin1Char('\r')); 0175 static const QLatin1Char lf(QLatin1Char('\n')); 0176 0177 /** 0178 * did we read two time but got no stuff? encoding error 0179 * fixes problem with one character latin-1 files, which lead to crash otherwise! 0180 * bug 272579 0181 */ 0182 bool failedToConvertOnce = false; 0183 /** 0184 * keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true 0185 * BUG: 440359 0186 */ 0187 bool bomPreviouslyFound = m_bomFound; 0188 0189 /** 0190 * reading loop 0191 */ 0192 while (m_position <= m_text.length()) { 0193 if (m_position == m_text.length()) { 0194 // try to load more text if something is around 0195 if (!m_eof) { 0196 // kill the old lines... 0197 m_text.remove(0, m_lastLineStart); 0198 0199 // try to read new data 0200 const int c = m_file->read(m_buffer.data(), m_buffer.size()); 0201 0202 // if any text is there, append it.... 0203 if (c > 0) { 0204 // update hash sum 0205 m_digest.addData(QByteArrayView(m_buffer.data(), c)); 0206 0207 // detect byte order marks & codec for byte order marks on first read 0208 if (m_firstRead) { 0209 /** 0210 * if no codec given, do autodetection 0211 */ 0212 if (!m_converterState.isValid()) { 0213 /** 0214 * first: try to get HTML header encoding, includes BOM handling 0215 */ 0216 m_converterState = QStringDecoder::decoderForHtml(m_buffer); 0217 0218 /** 0219 * else: use KEncodingProber 0220 */ 0221 if (!m_converterState.isValid()) { 0222 KEncodingProber prober(m_proberType); 0223 prober.feed(m_buffer.constData(), c); 0224 0225 // we found codec with some confidence? 0226 if (prober.confidence() > 0.5) { 0227 m_converterState = QStringDecoder(prober.encoding().constData()); 0228 } 0229 } 0230 0231 // no codec, no chance, encoding error, else remember the codec name 0232 if (!m_converterState.isValid()) { 0233 return false; 0234 } 0235 } 0236 0237 // we want to convert the bom for later detection 0238 m_converterState = QStringDecoder(m_converterState.name(), QStringConverter::Flag::ConvertInitialBom); 0239 0240 // remember name, might have changed 0241 m_codec = QString::fromUtf8(m_converterState.name()); 0242 } 0243 0244 // detect broken encoding 0245 Q_ASSERT(m_converterState.isValid()); 0246 const QString unicode = m_converterState.decode(QByteArrayView(m_buffer.data(), c)); 0247 encodingError = encodingError || m_converterState.hasError(); 0248 0249 // check and remove bom 0250 if (m_firstRead && !unicode.isEmpty() && (unicode.front() == QChar::ByteOrderMark || unicode.front() == QChar::ByteOrderSwapped)) { 0251 m_bomFound = true; 0252 m_text.append(QStringView(unicode).last(unicode.size() - 1)); 0253 0254 // swapped BOM is encoding error 0255 encodingError = encodingError || unicode.front() == QChar::ByteOrderSwapped; 0256 } else { 0257 m_text.append(unicode); 0258 } 0259 m_firstRead = false; 0260 } 0261 0262 // is file completely read ? 0263 m_eof = (c == -1) || (c == 0); 0264 0265 // recalc current pos and last pos 0266 m_position -= m_lastLineStart; 0267 m_lastLineStart = 0; 0268 } 0269 0270 // oh oh, end of file, escape ! 0271 if (m_eof && (m_position == m_text.length())) { 0272 m_lastWasEndOfLine = false; 0273 0274 // line data 0275 offset = m_lastLineStart; 0276 length = m_position - m_lastLineStart; 0277 0278 m_lastLineStart = m_position; 0279 0280 return !encodingError && !failedToConvertOnce; 0281 } 0282 0283 // empty? try again 0284 if (m_position == m_text.length()) { 0285 if (!bomPreviouslyFound && m_bomFound) { 0286 // BOM was processed above, so we didn't fail to convert 0287 bomPreviouslyFound = true; 0288 } else { 0289 failedToConvertOnce = true; 0290 } 0291 continue; 0292 } 0293 } 0294 for (; m_position < m_text.length(); m_position++) { 0295 QChar current_char = m_text.at(m_position); 0296 if (current_char == lf) { 0297 m_lastWasEndOfLine = true; 0298 0299 if (m_lastWasR) { 0300 m_lastLineStart++; 0301 m_lastWasR = false; 0302 m_eol = TextBuffer::eolDos; 0303 } else { 0304 // line data 0305 offset = m_lastLineStart; 0306 length = m_position - m_lastLineStart; 0307 0308 m_lastLineStart = m_position + 1; 0309 m_position++; 0310 0311 // only win, if not dos! 0312 if (m_eol != TextBuffer::eolDos) { 0313 m_eol = TextBuffer::eolUnix; 0314 } 0315 0316 return !encodingError; 0317 } 0318 } else if (current_char == cr) { 0319 m_lastWasEndOfLine = true; 0320 m_lastWasR = true; 0321 0322 // line data 0323 offset = m_lastLineStart; 0324 length = m_position - m_lastLineStart; 0325 0326 m_lastLineStart = m_position + 1; 0327 m_position++; 0328 0329 // should only win of first time! 0330 if (m_eol == TextBuffer::eolUnknown) { 0331 m_eol = TextBuffer::eolMac; 0332 } 0333 0334 return !encodingError; 0335 } else if (current_char == QChar::LineSeparator) { 0336 m_lastWasEndOfLine = true; 0337 0338 // line data 0339 offset = m_lastLineStart; 0340 length = m_position - m_lastLineStart; 0341 0342 m_lastLineStart = m_position + 1; 0343 m_position++; 0344 0345 return !encodingError; 0346 } else { 0347 m_lastWasEndOfLine = false; 0348 m_lastWasR = false; 0349 } 0350 } 0351 } 0352 0353 return !encodingError; 0354 } 0355 0356 QByteArray digest() 0357 { 0358 return m_digest.result(); 0359 } 0360 0361 private: 0362 QString m_codec; 0363 bool m_eof; 0364 bool m_lastWasEndOfLine; 0365 bool m_lastWasR; 0366 int m_position; 0367 int m_lastLineStart; 0368 TextBuffer::EndOfLineMode m_eol; 0369 QString m_mimeType; 0370 QIODevice *m_file; 0371 QByteArray m_buffer; 0372 QCryptographicHash m_digest; 0373 QString m_text; 0374 QStringDecoder m_converterState; 0375 bool m_bomFound; 0376 bool m_firstRead; 0377 KEncodingProber::ProberType m_proberType; 0378 quint64 m_fileSize; 0379 }; 0380 0381 } 0382 0383 #endif