File indexing completed on 2023-09-24 04:11:28
0001 /* 0002 SPDX-FileCopyrightText: 2010 Christoph Cullmann <cullmann@kde.org> 0003 0004 SPDX-License-Identifier: LGPL-2.0-or-later 0005 */ 0006 0007 #ifndef KATE_TEXTLOADER_H 0008 #define KATE_TEXTLOADER_H 0009 0010 #include <QCryptographicHash> 0011 #include <QFile> 0012 #include <QMimeDatabase> 0013 #include <QString> 0014 0015 // on the fly compression 0016 #include <KCompressionDevice> 0017 0018 namespace Kate 0019 { 0020 /** 0021 * loader block size, load 256 kb at once per default 0022 * if file size is smaller, fall back to file size 0023 * must be a multiple of 2 0024 */ 0025 static const qint64 KATE_FILE_LOADER_BS = 256 * 1024; 0026 0027 /** 0028 * File Loader, will handle reading of files + detecting encoding 0029 */ 0030 class TextLoader 0031 { 0032 public: 0033 /** 0034 * Construct file loader for given file. 0035 * @param filename file to open 0036 * @param proberType prober type 0037 */ 0038 TextLoader(const QString &filename, KEncodingProber::ProberType proberType) 0039 : m_codec(nullptr) 0040 , m_eof(false) // default to not eof 0041 , m_lastWasEndOfLine(true) // at start of file, we had a virtual newline 0042 , m_lastWasR(false) // we have not found a \r as last char 0043 , m_position(0) 0044 , m_lastLineStart(0) 0045 , m_eol(TextBuffer::eolUnknown) // no eol type detected atm 0046 , m_buffer(KATE_FILE_LOADER_BS, 0) 0047 , m_digest(QCryptographicHash::Sha1) 0048 , m_converterState(nullptr) 0049 , m_bomFound(false) 0050 , m_firstRead(true) 0051 , m_proberType(proberType) 0052 , m_fileSize(0) 0053 { 0054 // try to get mimetype for on the fly decompression, don't rely on filename! 0055 QFile testMime(filename); 0056 if (testMime.open(QIODevice::ReadOnly)) { 0057 m_fileSize = testMime.size(); 0058 } 0059 m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(filename, &testMime).name(); 0060 0061 // construct filter device 0062 KCompressionDevice::CompressionType compressionType = KCompressionDevice::compressionTypeForMimeType(m_mimeType); 0063 m_file = new KCompressionDevice(filename, compressionType); 0064 } 0065 0066 /** 0067 * Destructor 0068 */ 0069 ~TextLoader() 0070 { 0071 delete m_file; 0072 delete m_converterState; 0073 } 0074 0075 /** 0076 * open file with given codec 0077 * @param codec codec to use, if 0, will do some auto-detect or fallback 0078 * @return success 0079 */ 0080 bool open(QTextCodec *codec) 0081 { 0082 m_codec = codec; 0083 m_eof = false; 0084 m_lastWasEndOfLine = true; 0085 m_lastWasR = false; 0086 m_position = 0; 0087 m_lastLineStart = 0; 0088 m_eol = TextBuffer::eolUnknown; 0089 m_text.clear(); 0090 delete m_converterState; 0091 m_converterState = new QTextCodec::ConverterState(QTextCodec::DefaultConversion); 0092 m_bomFound = false; 0093 m_firstRead = true; 0094 0095 // init the hash with the git header 0096 const QString header = QStringLiteral("blob %1").arg(m_fileSize); 0097 m_digest.reset(); 0098 m_digest.addData(QByteArray(header.toLatin1() + '\0')); 0099 0100 // if already opened, close the file... 0101 if (m_file->isOpen()) { 0102 m_file->close(); 0103 } 0104 0105 return m_file->open(QIODevice::ReadOnly); 0106 } 0107 0108 /** 0109 * end of file reached? 0110 * @return end of file reached 0111 */ 0112 bool eof() const 0113 { 0114 return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length()); 0115 } 0116 0117 /** 0118 * Detected end of line mode for this file. 0119 * Detected during reading, is valid after complete file is read. 0120 * @return eol mode of this file 0121 */ 0122 TextBuffer::EndOfLineMode eol() const 0123 { 0124 return m_eol; 0125 } 0126 0127 /** 0128 * BOM found? 0129 * @return byte order mark found? 0130 */ 0131 bool byteOrderMarkFound() const 0132 { 0133 return m_bomFound; 0134 } 0135 0136 /** 0137 * mime type used to create filter dev 0138 * @return mime-type of filter device 0139 */ 0140 const QString &mimeTypeForFilterDev() const 0141 { 0142 return m_mimeType; 0143 } 0144 0145 /** 0146 * internal Unicode data array 0147 * @return internal Unicode data 0148 */ 0149 const QChar *unicode() const 0150 { 0151 return m_text.unicode(); 0152 } 0153 0154 /** 0155 * Get codec for this loader 0156 * @return currently in use codec of this loader 0157 */ 0158 QTextCodec *textCodec() const 0159 { 0160 return m_codec; 0161 } 0162 0163 /** 0164 * read a line, return length + offset in Unicode data 0165 * @param offset offset into internal Unicode data for read line 0166 * @param length length of read line 0167 * @return true if no encoding errors occurred 0168 */ 0169 bool readLine(int &offset, int &length) 0170 { 0171 length = 0; 0172 offset = 0; 0173 bool encodingError = false; 0174 0175 static const QLatin1Char cr(QLatin1Char('\r')); 0176 static const QLatin1Char lf(QLatin1Char('\n')); 0177 0178 /** 0179 * did we read two time but got no stuff? encoding error 0180 * fixes problem with one character latin-1 files, which lead to crash otherwise! 0181 * bug 272579 0182 */ 0183 bool failedToConvertOnce = false; 0184 /** 0185 * keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true 0186 * BUG: 440359 0187 */ 0188 bool bomPreviouslyFound = m_bomFound; 0189 0190 /** 0191 * reading loop 0192 */ 0193 while (m_position <= m_text.length()) { 0194 if (m_position == m_text.length()) { 0195 // try to load more text if something is around 0196 if (!m_eof) { 0197 // kill the old lines... 0198 m_text.remove(0, m_lastLineStart); 0199 0200 // try to read new data 0201 const int c = m_file->read(m_buffer.data(), m_buffer.size()); 0202 0203 // if any text is there, append it.... 0204 if (c > 0) { 0205 // update hash sum 0206 m_digest.addData(m_buffer.data(), c); 0207 0208 // detect byte order marks & codec for byte order marks on first read 0209 int bomBytes = 0; 0210 if (m_firstRead) { 0211 // use first 16 bytes max to allow BOM detection of codec 0212 QByteArray bom(m_buffer.data(), qMin(16, c)); 0213 QTextCodec *codecForByteOrderMark = QTextCodec::codecForUtfText(bom, nullptr); 0214 0215 // if codecForByteOrderMark != null, we found a BOM! 0216 // BUT we only capture BOM if no codec was set, or the BOM encodes the same codec as m_codec. 0217 // These additional checks are necessary so that the (coincidentally matching) BOM characters won't be eaten for non-UTF encodings 0218 // TODO: support BOMs for other encodings? (see e.g. https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding) 0219 if (codecForByteOrderMark && (!m_codec || codecForByteOrderMark->mibEnum() == m_codec->mibEnum())) { 0220 m_bomFound = true; 0221 0222 // eat away the different boms! 0223 const int mib = codecForByteOrderMark->mibEnum(); 0224 if (mib == 106) { // utf8 0225 bomBytes = 3; 0226 } else if (mib == 1013 || mib == 1014 || mib == 1015) { // utf16 0227 bomBytes = 2; 0228 } else if (mib == 1017 || mib == 1018 || mib == 1019) { // utf32 0229 bomBytes = 4; 0230 } 0231 } 0232 0233 /** 0234 * if no codec given, do autodetection 0235 */ 0236 if (!m_codec) { 0237 /** 0238 * byte order said something about encoding? 0239 */ 0240 if (codecForByteOrderMark) { 0241 m_codec = codecForByteOrderMark; 0242 } else { 0243 /** 0244 * no Unicode BOM found, trigger prober 0245 */ 0246 0247 /** 0248 * first: try to get HTML header encoding 0249 */ 0250 if (QTextCodec *codecForHtml = QTextCodec::codecForHtml(m_buffer, nullptr)) { 0251 m_codec = codecForHtml; 0252 } 0253 0254 /** 0255 * else: use KEncodingProber 0256 */ 0257 else { 0258 KEncodingProber prober(m_proberType); 0259 prober.feed(m_buffer.constData(), c); 0260 0261 // we found codec with some confidence? 0262 if (prober.confidence() > 0.5) { 0263 m_codec = QTextCodec::codecForName(prober.encoding()); 0264 } 0265 } 0266 0267 // no codec, no chance, encoding error 0268 if (!m_codec) { 0269 return false; 0270 } 0271 } 0272 } 0273 0274 m_firstRead = false; 0275 } 0276 0277 // detect broken encoding, we did before use QTextCodec::ConvertInvalidToNull and check for 0 chars 0278 // this lead to issues with files containing 0 chars, therefore use the invalidChars field of the state 0279 Q_ASSERT(m_codec); 0280 QString unicode = m_codec->toUnicode(m_buffer.constData() + bomBytes, c - bomBytes, m_converterState); 0281 encodingError = encodingError || m_converterState->invalidChars; 0282 m_text.append(unicode); 0283 } 0284 0285 // is file completely read ? 0286 m_eof = (c == -1) || (c == 0); 0287 0288 // recalc current pos and last pos 0289 m_position -= m_lastLineStart; 0290 m_lastLineStart = 0; 0291 } 0292 0293 // oh oh, end of file, escape ! 0294 if (m_eof && (m_position == m_text.length())) { 0295 m_lastWasEndOfLine = false; 0296 0297 // line data 0298 offset = m_lastLineStart; 0299 length = m_position - m_lastLineStart; 0300 0301 m_lastLineStart = m_position; 0302 0303 return !encodingError && !failedToConvertOnce; 0304 } 0305 0306 // empty? try again 0307 if (m_position == m_text.length()) { 0308 if (!bomPreviouslyFound && m_bomFound) { 0309 // BOM was processed above, so we didn't fail to convert 0310 bomPreviouslyFound = true; 0311 } else { 0312 failedToConvertOnce = true; 0313 } 0314 continue; 0315 } 0316 } 0317 0318 QChar current_char = m_text.at(m_position); 0319 if (current_char == lf) { 0320 m_lastWasEndOfLine = true; 0321 0322 if (m_lastWasR) { 0323 m_lastLineStart++; 0324 m_lastWasR = false; 0325 m_eol = TextBuffer::eolDos; 0326 } else { 0327 // line data 0328 offset = m_lastLineStart; 0329 length = m_position - m_lastLineStart; 0330 0331 m_lastLineStart = m_position + 1; 0332 m_position++; 0333 0334 // only win, if not dos! 0335 if (m_eol != TextBuffer::eolDos) { 0336 m_eol = TextBuffer::eolUnix; 0337 } 0338 0339 return !encodingError; 0340 } 0341 } else if (current_char == cr) { 0342 m_lastWasEndOfLine = true; 0343 m_lastWasR = true; 0344 0345 // line data 0346 offset = m_lastLineStart; 0347 length = m_position - m_lastLineStart; 0348 0349 m_lastLineStart = m_position + 1; 0350 m_position++; 0351 0352 // should only win of first time! 0353 if (m_eol == TextBuffer::eolUnknown) { 0354 m_eol = TextBuffer::eolMac; 0355 } 0356 0357 return !encodingError; 0358 } else if (current_char == QChar::LineSeparator) { 0359 m_lastWasEndOfLine = true; 0360 0361 // line data 0362 offset = m_lastLineStart; 0363 length = m_position - m_lastLineStart; 0364 0365 m_lastLineStart = m_position + 1; 0366 m_position++; 0367 0368 return !encodingError; 0369 } else { 0370 m_lastWasEndOfLine = false; 0371 m_lastWasR = false; 0372 } 0373 0374 m_position++; 0375 } 0376 0377 return !encodingError; 0378 } 0379 0380 QByteArray digest() 0381 { 0382 return m_digest.result(); 0383 } 0384 0385 private: 0386 QTextCodec *m_codec; 0387 bool m_eof; 0388 bool m_lastWasEndOfLine; 0389 bool m_lastWasR; 0390 int m_position; 0391 int m_lastLineStart; 0392 TextBuffer::EndOfLineMode m_eol; 0393 QString m_mimeType; 0394 QIODevice *m_file; 0395 QByteArray m_buffer; 0396 QCryptographicHash m_digest; 0397 QString m_text; 0398 QTextCodec::ConverterState *m_converterState; 0399 bool m_bomFound; 0400 bool m_firstRead; 0401 KEncodingProber::ProberType m_proberType; 0402 quint64 m_fileSize; 0403 }; 0404 0405 } 0406 0407 #endif