src/buffer/katetextloader.h

0001 /*
0002     SPDX-FileCopyrightText: 2010 Christoph Cullmann <cullmann@kde.org>
0003
0004     SPDX-License-Identifier: LGPL-2.0-or-later
0005 */
0006
0007 #ifndef KATE_TEXTLOADER_H
0008 #define KATE_TEXTLOADER_H
0009
#include <QCryptographicHash>
#include <QFile>
#include <QMimeDatabase>
#include <QString>

// on the fly compression
#include <KCompressionDevice>

#include <memory>
0017
0018 namespace Kate
0019 {
0020 /**
0021  * loader block size, load 256 kb at once per default
0022  * if file size is smaller, fall back to file size
0023  * must be a multiple of 2
0024  */
0025 static const qint64 KATE_FILE_LOADER_BS = 256 * 1024;
0026
0027 /**
0028  * File Loader, will handle reading of files + detecting encoding
0029  */
0030 class TextLoader
0031 {
0032 public:
0033     /**
0034      * Construct file loader for given file.
0035      * @param filename file to open
0036      * @param proberType prober type
0037      */
0038     TextLoader(const QString &filename, KEncodingProber::ProberType proberType)
0039         : m_codec(nullptr)
0040         , m_eof(false) // default to not eof
0041         , m_lastWasEndOfLine(true) // at start of file, we had a virtual newline
0042         , m_lastWasR(false) // we have not found a \r as last char
0043         , m_position(0)
0044         , m_lastLineStart(0)
0045         , m_eol(TextBuffer::eolUnknown) // no eol type detected atm
0046         , m_buffer(KATE_FILE_LOADER_BS, 0)
0047         , m_digest(QCryptographicHash::Sha1)
0048         , m_converterState(nullptr)
0049         , m_bomFound(false)
0050         , m_firstRead(true)
0051         , m_proberType(proberType)
0052         , m_fileSize(0)
0053     {
0054         // try to get mimetype for on the fly decompression, don't rely on filename!
0055         QFile testMime(filename);
0056         if (testMime.open(QIODevice::ReadOnly)) {
0057             m_fileSize = testMime.size();
0058         }
0059         m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(filename, &testMime).name();
0060
0061         // construct filter device
0062         KCompressionDevice::CompressionType compressionType = KCompressionDevice::compressionTypeForMimeType(m_mimeType);
0063         m_file = new KCompressionDevice(filename, compressionType);
0064     }
0065
0066     /**
0067      * Destructor
0068      */
0069     ~TextLoader()
0070     {
0071         delete m_file;
0072         delete m_converterState;
0073     }
0074
0075     /**
0076      * open file with given codec
0077      * @param codec codec to use, if 0, will do some auto-detect or fallback
0078      * @return success
0079      */
0080     bool open(QTextCodec *codec)
0081     {
0082         m_codec = codec;
0083         m_eof = false;
0084         m_lastWasEndOfLine = true;
0085         m_lastWasR = false;
0086         m_position = 0;
0087         m_lastLineStart = 0;
0088         m_eol = TextBuffer::eolUnknown;
0089         m_text.clear();
0090         delete m_converterState;
0091         m_converterState = new QTextCodec::ConverterState(QTextCodec::DefaultConversion);
0092         m_bomFound = false;
0093         m_firstRead = true;
0094
0095         // init the hash with the git header
0096         const QString header = QStringLiteral("blob %1").arg(m_fileSize);
0097         m_digest.reset();
0098         m_digest.addData(QByteArray(header.toLatin1() + '\0'));
0099
0100         // if already opened, close the file...
0101         if (m_file->isOpen()) {
0102             m_file->close();
0103         }
0104
0105         return m_file->open(QIODevice::ReadOnly);
0106     }
0107
0108     /**
0109      * end of file reached?
0110      * @return end of file reached
0111      */
0112     bool eof() const
0113     {
0114         return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length());
0115     }
0116
0117     /**
0118      * Detected end of line mode for this file.
0119      * Detected during reading, is valid after complete file is read.
0120      * @return eol mode of this file
0121      */
0122     TextBuffer::EndOfLineMode eol() const
0123     {
0124         return m_eol;
0125     }
0126
0127     /**
0128      * BOM found?
0129      * @return byte order mark found?
0130      */
0131     bool byteOrderMarkFound() const
0132     {
0133         return m_bomFound;
0134     }
0135
0136     /**
0137      * mime type used to create filter dev
0138      * @return mime-type of filter device
0139      */
0140     const QString &mimeTypeForFilterDev() const
0141     {
0142         return m_mimeType;
0143     }
0144
0145     /**
0146      * internal Unicode data array
0147      * @return internal Unicode data
0148      */
0149     const QChar *unicode() const
0150     {
0151         return m_text.unicode();
0152     }
0153
0154     /**
0155      * Get codec for this loader
0156      * @return currently in use codec of this loader
0157      */
0158     QTextCodec *textCodec() const
0159     {
0160         return m_codec;
0161     }
0162
0163     /**
0164      * read a line, return length + offset in Unicode data
0165      * @param offset offset into internal Unicode data for read line
0166      * @param length length of read line
0167      * @return true if no encoding errors occurred
0168      */
0169     bool readLine(int &offset, int &length)
0170     {
0171         length = 0;
0172         offset = 0;
0173         bool encodingError = false;
0174
0175         static const QLatin1Char cr(QLatin1Char('\r'));
0176         static const QLatin1Char lf(QLatin1Char('\n'));
0177
0178         /**
0179          * did we read two time but got no stuff? encoding error
0180          * fixes problem with one character latin-1 files, which lead to crash otherwise!
0181          * bug 272579
0182          */
0183         bool failedToConvertOnce = false;
0184         /**
0185          * keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true
0186          * BUG: 440359
0187          */
0188         bool bomPreviouslyFound = m_bomFound;
0189
0190         /**
0191          * reading loop
0192          */
0193         while (m_position <= m_text.length()) {
0194             if (m_position == m_text.length()) {
0195                 // try to load more text if something is around
0196                 if (!m_eof) {
0197                     // kill the old lines...
0198                     m_text.remove(0, m_lastLineStart);
0199
0200                     // try to read new data
0201                     const int c = m_file->read(m_buffer.data(), m_buffer.size());
0202
0203                     // if any text is there, append it....
0204                     if (c > 0) {
0205                         // update hash sum
0206                         m_digest.addData(m_buffer.data(), c);
0207
0208                         // detect byte order marks & codec for byte order marks on first read
0209                         int bomBytes = 0;
0210                         if (m_firstRead) {
0211                             // use first 16 bytes max to allow BOM detection of codec
0212                             QByteArray bom(m_buffer.data(), qMin(16, c));
0213                             QTextCodec *codecForByteOrderMark = QTextCodec::codecForUtfText(bom, nullptr);
0214
0215                             // if codecForByteOrderMark != null, we found a BOM!
0216                             // BUT we only capture BOM if no codec was set, or the BOM encodes the same codec as m_codec.
0217                             // These additional checks are necessary so that the (coincidentally matching) BOM characters won't be eaten for non-UTF encodings
0218                             // TODO: support BOMs for other encodings? (see e.g. https://en.wikipedia.org/wiki/Byte_order_mark#Byte_order_marks_by_encoding)
0219                             if (codecForByteOrderMark && (!m_codec || codecForByteOrderMark->mibEnum() == m_codec->mibEnum())) {
0220                                 m_bomFound = true;
0221
0222                                 // eat away the different boms!
0223                                 const int mib = codecForByteOrderMark->mibEnum();
0224                                 if (mib == 106) { // utf8
0225                                     bomBytes = 3;
0226                                 } else if (mib == 1013 || mib == 1014 || mib == 1015) { // utf16
0227                                     bomBytes = 2;
0228                                 } else if (mib == 1017 || mib == 1018 || mib == 1019) { // utf32
0229                                     bomBytes = 4;
0230                                 }
0231                             }
0232
0233                             /**
0234                              * if no codec given, do autodetection
0235                              */
0236                             if (!m_codec) {
0237                                 /**
0238                                  * byte order said something about encoding?
0239                                  */
0240                                 if (codecForByteOrderMark) {
0241                                     m_codec = codecForByteOrderMark;
0242                                 } else {
0243                                     /**
0244                                      * no Unicode BOM found, trigger prober
0245                                      */
0246
0247                                     /**
0248                                      * first: try to get HTML header encoding
0249                                      */
0250                                     if (QTextCodec *codecForHtml = QTextCodec::codecForHtml(m_buffer, nullptr)) {
0251                                         m_codec = codecForHtml;
0252                                     }
0253
0254                                     /**
0255                                      * else: use KEncodingProber
0256                                      */
0257                                     else {
0258                                         KEncodingProber prober(m_proberType);
0259                                         prober.feed(m_buffer.constData(), c);
0260
0261                                         // we found codec with some confidence?
0262                                         if (prober.confidence() > 0.5) {
0263                                             m_codec = QTextCodec::codecForName(prober.encoding());
0264                                         }
0265                                     }
0266
0267                                     // no codec, no chance, encoding error
0268                                     if (!m_codec) {
0269                                         return false;
0270                                     }
0271                                 }
0272                             }
0273
0274                             m_firstRead = false;
0275                         }
0276
0277                         // detect broken encoding, we did before use QTextCodec::ConvertInvalidToNull and check for 0 chars
0278                         // this lead to issues with files containing 0 chars, therefore use the invalidChars field of the state
0279                         Q_ASSERT(m_codec);
0280                         QString unicode = m_codec->toUnicode(m_buffer.constData() + bomBytes, c - bomBytes, m_converterState);
0281                         encodingError = encodingError || m_converterState->invalidChars;
0282                         m_text.append(unicode);
0283                     }
0284
0285                     // is file completely read ?
0286                     m_eof = (c == -1) || (c == 0);
0287
0288                     // recalc current pos and last pos
0289                     m_position -= m_lastLineStart;
0290                     m_lastLineStart = 0;
0291                 }
0292
0293                 // oh oh, end of file, escape !
0294                 if (m_eof && (m_position == m_text.length())) {
0295                     m_lastWasEndOfLine = false;
0296
0297                     // line data
0298                     offset = m_lastLineStart;
0299                     length = m_position - m_lastLineStart;
0300
0301                     m_lastLineStart = m_position;
0302
0303                     return !encodingError && !failedToConvertOnce;
0304                 }
0305
0306                 // empty? try again
0307                 if (m_position == m_text.length()) {
0308                     if (!bomPreviouslyFound && m_bomFound) {
0309                         // BOM was processed above, so we didn't fail to convert
0310                         bomPreviouslyFound = true;
0311                     } else {
0312                         failedToConvertOnce = true;
0313                     }
0314                     continue;
0315                 }
0316             }
0317
0318             QChar current_char = m_text.at(m_position);
0319             if (current_char == lf) {
0320                 m_lastWasEndOfLine = true;
0321
0322                 if (m_lastWasR) {
0323                     m_lastLineStart++;
0324                     m_lastWasR = false;
0325                     m_eol = TextBuffer::eolDos;
0326                 } else {
0327                     // line data
0328                     offset = m_lastLineStart;
0329                     length = m_position - m_lastLineStart;
0330
0331                     m_lastLineStart = m_position + 1;
0332                     m_position++;
0333
0334                     // only win, if not dos!
0335                     if (m_eol != TextBuffer::eolDos) {
0336                         m_eol = TextBuffer::eolUnix;
0337                     }
0338
0339                     return !encodingError;
0340                 }
0341             } else if (current_char == cr) {
0342                 m_lastWasEndOfLine = true;
0343                 m_lastWasR = true;
0344
0345                 // line data
0346                 offset = m_lastLineStart;
0347                 length = m_position - m_lastLineStart;
0348
0349                 m_lastLineStart = m_position + 1;
0350                 m_position++;
0351
0352                 // should only win of first time!
0353                 if (m_eol == TextBuffer::eolUnknown) {
0354                     m_eol = TextBuffer::eolMac;
0355                 }
0356
0357                 return !encodingError;
0358             } else if (current_char == QChar::LineSeparator) {
0359                 m_lastWasEndOfLine = true;
0360
0361                 // line data
0362                 offset = m_lastLineStart;
0363                 length = m_position - m_lastLineStart;
0364
0365                 m_lastLineStart = m_position + 1;
0366                 m_position++;
0367
0368                 return !encodingError;
0369             } else {
0370                 m_lastWasEndOfLine = false;
0371                 m_lastWasR = false;
0372             }
0373
0374             m_position++;
0375         }
0376
0377         return !encodingError;
0378     }
0379
0380     QByteArray digest()
0381     {
0382         return m_digest.result();
0383     }
0384
0385 private:
0386     QTextCodec *m_codec;
0387     bool m_eof;
0388     bool m_lastWasEndOfLine;
0389     bool m_lastWasR;
0390     int m_position;
0391     int m_lastLineStart;
0392     TextBuffer::EndOfLineMode m_eol;
0393     QString m_mimeType;
0394     QIODevice *m_file;
0395     QByteArray m_buffer;
0396     QCryptographicHash m_digest;
0397     QString m_text;
0398     QTextCodec::ConverterState *m_converterState;
0399     bool m_bomFound;
0400     bool m_firstRead;
0401     KEncodingProber::ProberType m_proberType;
0402     quint64 m_fileSize;
0403 };
0404
0405 }
0406
0407 #endif